def tabelog(url: str) -> Dict[str, str]:
    result_dict: Dict[str, str] = {}
    soup = U.fetch_url(url)
    n_rst_pages = _get_total_page_num(soup)
    # pick a random page number in [1, n_rst_pages]
    page_number = random.randrange(n_rst_pages) + 1
    soup = U.fetch_one_page(url, page_number)
    for _ in range(C.MAX_RETRY):
        try:
            rst_url, name = _random_choice_rst(soup)
            result_dict["rst_url"] = rst_url
            rst_url = urllib.parse.urljoin(rst_url, C.PHOTO_SUBDIR_TABELOG) + "/"
            result_dict["rst_name"] = name
            img_soup = U.fetch_one_page(rst_url, 1)
            n_img_pages = _get_total_page_num(img_soup)
            page_number = random.randrange(n_img_pages) + 1
            img_page_url = rst_url + "1/smp0/D-normal/" + str(page_number)
            img_soup = U.fetch_url(img_page_url)
            img_url, title = _random_choice_img(img_soup)
            break
        except urllib.error.HTTPError:
            continue
    result_dict["img_name"] = title
    result_dict["img_url"] = img_url
    return result_dict
def main():
    html1 = utils.fetch_url(url_1)
    html2 = utils.fetch_url(url_2)
    html3 = utils.fetch_url(url_3)
    html4 = utils.fetch_url(url_4)
    process1(html1)
    process2(html2)
    process3(html3)
    process3(html4)
    save(txt_filename, url_list)
def get_title(url):
    if not is_valid_url(url):
        return
    host = urlparse(url).netloc
    try:
        ip = socket.gethostbyname(host)
    except socket.error:  # DNS resolution failed
        return
    if is_not_public(ip):
        return
    data = fetch_url(url)
    page = data.read(4096)
    if page == '':
        return
    title_match = title_regex.search(page)
    if title_match:
        title = title_match.group(1)
        title = title.strip().replace('\n', '')
        return unescape(title)
def extract_stock2(code):
    fields = {
        'sjl': '592920',
        'zsz': '3541450',
        'ltsz': '3475914',
    }
    cnx = mysqllib.get_connection()
    cursor = cnx.cursor()
    url = "http://d.10jqka.com.cn/v2/realhead/hs_%s/last.js" % (code)
    data = utils.fetch_url(url)
    data = re.sub(r'quotebridge.*?\((.*)\)', r'\1', data)
    jo = json.loads(data)['items']
    if jo is not None:
        try:
            # scale raw market-cap values down to hundred-millions
            jo['3541450'] = "%.2f" % (float(jo['3541450']) / 100000000)
            jo['3475914'] = "%.2f" % (float(jo['3475914']) / 100000000)
            keys = fields.keys()
            vals = ["'" + (jo[fields[k]] or '') + "'" for k in keys]
            keys.append('code')
            vals.append("'%s'" % (code))
            updates = [keys[i] + "=" + vals[i] for i in range(0, len(keys))]
        except:
            utils.print_with_time("url=%s" % (url))
            traceback.print_exc()
            return
        sql = "INSERT INTO stock (%s) VALUES (%s) ON DUPLICATE KEY UPDATE %s" % (
            ', '.join(keys), ', '.join(vals), ', '.join(updates))
        # print sql
        cursor.execute(sql)
        cnx.commit()
    cursor.close()
    cnx.close()
def search_results(self):
    url = self.context.getRemoteUrl()
    search_term = urllib.quote_plus(self.searchterm)
    if self.has_searchterm():
        if not search_term:
            return []
        else:
            qurl = substitute_parameters(url, self.request.form)
    else:
        qurl = url
    rd = fetch_url(qurl)
    results = rd['result']
    if rd['type'] == 'feed':
        try:
            self.total_results = int(
                results.feed.get('opensearch_totalresults', '0'))
            if self.total_results == 0:
                self.total_results = int(
                    results.feed.get('totalresults', '0'))
        except ValueError:
            pass
        for link in results.feed.get('links', []):
            if (link['rel'] == 'alternate') and (link['type'] == 'text/html'):
                self.feed_html_link = link['href']
        self.feed_title = results.feed.get('title', '')
        return results['entries']
    elif rd['type'] == 'kml':
        return parse_kml(results)
    else:
        return []
def _process_image(self, img):
    pic = pq(img).attr('src')
    # Attachment
    if pic == 'static/image/common/none.gif':
        pic = 'http://www.lightnovel.cn/{}'.format(pq(img).attr('file'))
    if pic.startswith('http'):
        # Resize/divide image if necessary
        try:
            image_buffer = StringIO(fetch_url(pic))
            image = Image.open(image_buffer)
            # Grayscale size saving too little
            #image = ImageOps.grayscale(image)
            if image.size[0] > image.size[1]:
                image = image.rotate(90)
            image.thumbnail((600, 800), Image.ANTIALIAS)
            filename = self._add_image(image)
            pq(img).attr('src', filename)
            pq(img).attr('width', str(image.size[0]))
            pq(img).attr('height', str(image.size[1]))
            image_buffer.close()
        except HTTPError:
            print 'Cannot find image: {}'.format(pic)
def _turn_page(self):
    parts = self.url.split('-')
    parts[2] = str(self.next_page)
    url = '-'.join(parts)
    self.page = fetch_url(url).decode('gbk', 'replace')
    self.d = pq(self.page)
    self.next_page += 1
def get_current_streams():
    """Returns a list of streams (with only relevant keys)"""
    streams = list()
    afreeca_url = 'http://live.afreecatv.com:8057/afreeca/broad_list_api.php'
    # afreeca_url = 'http://localhost:8000/broad_list_api.php'
    afreeca_response = utils.fetch_url(afreeca_url)
    afreeca_json_str = format_afreeca_response_to_json(afreeca_response)
    json_object = json.loads(afreeca_json_str)
    time_format = '%Y-%m-%d %H:%M'
    time_offset = 9
    for info in json_object['CHANNEL']['REAL_BROAD']:
        id = info['user_id']
        viewers = int(info['total_view_cnt'])
        locked = info['is_password'] == 'Y'
        online_since = utils.get_utc_time(info['broad_start'], time_format, time_offset)
        image = info['broad_img']
        stream = {
            'type': 'afreeca',
            'id': id,
            'viewers': viewers,
            'online_since': online_since,
            'image': image,
            'locked': locked
        }
        streams.append(stream)
    return streams
def hdp_albums():
    return [{
        'label': r['title'],
        'path': 'plugin://script.module.hdpparser?uri=' + quote_plus(r['url']),
        'thumbnail': r.get('thumb'),
    } for r in json.loads(fetch_url('http://xbmc.hdpfans.com/albums.json'))]
def get_uk(self):
    url = "http://pan.baidu.com/share/manage"
    content = fetch_url(url, headers={"Cookie": "BDUSS=" + self._bduss})
    _RE = re.compile(r'<a class="homepagelink" href="http://pan.baidu.com/share/home\?uk=(\d+)"')  # noqa
    uk = int(_RE.search(content).group(1))
    return uk
def main():
    for i in range(1, 11):
        url = 'https://pente.koro-pokemon.com/data/waza-list-{0}.shtml'.format(i)
        html = utils.fetch_url(url)
        process(html)
    utils.save(json_filename, new_moves_list)
def _fetch_clientapi(self, url, data=None, headers=None, need_auth=True):
    # avoid a shared mutable default dict, since headers is modified below
    headers = dict(headers or {})
    if need_auth and self._bduss:
        headers['Cookie'] = 'BDUSS=' + self._bduss
    content = fetch_url(url, data, headers)
    r = json.loads(content)
    if r.get('errno', 0) or r.get('error_code', 0):
        raise ClientApiError(r)
    return r
def guess():
    items = generate_items_from_page(fetch_url(urljoin(HOST, '/baidu.php')))
    items.append({
        'label': '换一批',
        'path': m.url_for('guess'),
    })
    return m.plugin.finish(items, view_mode='thumbnail')
def random_pair():
    host = 'http://es.wikipedia.org/w/api.php?'
    parameters = 'format=json&action=query&list=random&rnnamespace=0&rnlimit=2'
    url = host + parameters
    json_file = fetch_url(url)
    page_dictionary = json.load(json_file)
    return (page_dictionary['query']['random'][0]['title'].encode('utf-8'),
            page_dictionary['query']['random'][1]['title'].encode('utf-8'))
def archive_character_data(self, graph):
    """
    Fetch data for the nodes in the graph, only including those
    which are online.
    """
    nodes = graph.nodes()
    # Find only those characters who are not "OLD"
    # old_id = single_column(self.database_connection,
    #     f"SELECT character_id FROM {self.table_name}_character_info WHERE last_login_date < ?",
    #     (self.tSinceLogin,))
    # remaining_nodes = [n for n in nodes if n not in old_id]
    remaining_nodes = nodes
    # Gets character attributes for each found in the friend lists
    archive_id = single_column(
        self.archive_connection,
        f"SELECT character_id FROM {self.table_name}_node ")
    remaining_nodes = [n for n in remaining_nodes if n not in archive_id]
    re_count = len(remaining_nodes)
    fetch_logger().info(
        f"Number of nodes in graph is: {len(nodes)} Number of unarchived nodes is: {re_count}"
    )
    # Break the list up into chunks of 40
    smallLists = chunks(remaining_nodes, CHARACTER_INFO_BATCH_SIZE)
    completed_jobs = 0
    for character_id_batch in smallLists:
        character_ids = ",".join(character_id_batch)
        url = f"http://census.daybreakgames.com/s:{SERVICE_ID}/get/" \
              f"{self.namespace}/character/?character_id={character_ids}" \
              f"&c:resolve=outfit,name,stats,times,stat_history"
        fetch_logger().debug(f'fetching {url}')
        decoded = fetch_url(url)
        results = decoded["character_list"]
        for result in results:
            # Unpack the server response and add each to the archive_connection.
            try:
                self.archive_connection.execute(
                    f"INSERT OR REPLACE into {self.table_name}_node (character_id,raw) VALUES(?,?)",
                    (result["character_id"], json.dumps(result)),
                )
            except Exception:
                fetch_logger().info("archive_connection failure")
                if "error" in str(decoded):
                    fetch_logger().info("Server down")
                    exit(1)
                else:
                    raise
        self.archive_connection.commit()
        completed_jobs += len(character_id_batch)
        fetch_logger().info(
            f"looking up data completion is at {(completed_jobs / re_count) * 100.0} percent"
        )
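# A minimal sketch of the chunks() helper referenced above; this is an
# assumption for illustration, not the project's actual implementation.
# It splits a list into batches of at most n items, which is what the
# CHARACTER_INFO_BATCH_SIZE call sites need.
def chunks(items, n):
    """Yield successive n-sized slices from items."""
    for i in range(0, len(items), n):
        yield items[i:i + n]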
def scrap2(doi):
    base_url = "http://libgen.lc/scimag/ads.php?doi="
    html = utils.fetch_url(base_url + doi)
    for fila in html.split("\n"):
        try:
            clave = fila.index("http://booksdl.org/scimag/get.php?doi=" + doi)
            url = fila[clave:fila.index('"', clave)]
            return url
        except ValueError:  # str.index: marker not found on this line
            pass
def random_pair():
    global language
    host = 'http://%s.wikipedia.org/w/api.php?' % language
    parameters = 'format=json&action=query&list=random&rnnamespace=0&rnlimit=2'
    url = host + parameters
    json_file = fetch_url(url)
    page_dictionary = json.load(json_file)
    return [page_dictionary['query']['random'][i]['title'].encode('utf-8')
            for i in (0, 1)]
def search_result(keyword):
    return m.plugin.finish(
        generate_items_from_page(
            fetch_url(HOST + 'if.php', 'q=' + quote_plus(keyword))
        ),
        view_mode='thumbnail'
    )
def get_stocks_data_from_google(symbols):
    import json
    url = 'http://finance.google.com/finance/info?client=ig&q={0}'.format(','.join(symbols))
    print(url)
    r = fetch_url(url, 0)
    json_str = r.content.replace('/', '').strip()
    quotes = {}
    for quote in json.loads(json_str):
        quotes[quote['t'].upper()] = quote
    return quotes
def fetch_friend_lists_for_characters(
        namespace, character_list: List[str],
        problematic_character_ids: List[int]) -> List[dict]:
    """
    Return the list of friend list responses from the server.
    Also return the list of character ids who couldn't be loaded due to errors!
    """
    fetch_logger().info(f"fetch_friend_lists_for_characters {character_list}")
    # Attempt to build a url for this set of characters and handle errors
    # encountered along the way.
    unique_characters = list(set(character_list))
    if len(character_list) > 1:
        character_ids = ",".join(unique_characters)
    else:
        character_ids = str(character_list[0])
    friend_list_results = []
    url = f"http://census.daybreakgames.com/s:{SERVICE_ID}/get/{namespace}/characters_friend/" \
          f"?character_id={character_ids}&c:resolve=world"
    try:
        decoded = fetch_url(url)
        friend_list_results = decoded["characters_friend_list"]
    except GiveUpException as possible_overload_error:
        # Some characters have errors when you load the friends list; unclear why.
        if len(character_list) > 1:
            fetch_logger().error(
                f"Unable to load large group of ids: {character_list}")
            fetch_logger().error(str(possible_overload_error))
            for indi_index, individual in enumerate(character_list):
                fetch_logger().info(
                    f"Attempting to run individual {indi_index} ({individual})")
                individual_results = fetch_friend_lists_for_characters(
                    namespace, [individual], problematic_character_ids)
                if len(individual_results) > 0:
                    friend_list_results.extend(individual_results)
                else:
                    fetch_logger().warning(
                        f"Unable to fetch data for player {individual} for whatever reason")
        elif len(character_list) == 1:
            # record the single failing id, not the list itself
            problematic_character_ids.append(character_list[0])
    except Exception as err:
        fetch_logger().error(
            f"Unable to fetch friendlist for {character_list} {err} giving up and moving on")
    return friend_list_results
def fetch(self, gene):
    """Download the page content. NEEDS COOKIES FOR IT TO WORK."""
    url = BASE_GENECARDS_URL + gene
    logging.debug("Fetching URL " + url)
    headers = {
        "User-agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0",
        "Cookie": "ASP.NET_SessionId=hwjs2brcx2cbpjod4gnxijlv; rvcn=H1d-sbuapwNh7VvK5OojnZx_GNvK8NV3Y_igQlT3X1auxx-CGR_50Kv2Gtv0Wo_OSexSGqUeuYFqw_sxZCt8GPagEGA1; ARRAffinity=166bde02ef81ff7e7ac9e9a57f0ef302100f353e9212ba930c859133d8b6d672; visid_incap_146342=ng+3dLHhQg+n22cnTIdnpyPQzV0AAAAAQUIPAAAAAADYkswfNl1m+lEO6s1+k/62; nlbi_146342=67fBdWPizFQhuWCUmewSQgAAAACX5WRNOa574GkShdKAhsHo; incap_ses_789_146342=QK2DWJBMgCysSvfvbRjzCiTQzV0AAAAA6eTm5xFAT0bApBqNQRJH0w==; _ga=GA1.2.752885262.1573769256; _gid=GA1.2.1465475919.1573769256; __gads=ID=bb532cbe1d9196bc:T=1573769276:S=ALNI_MZlxaQcjBdoHS5r7fq1pdgh3l_5cg; EU_COOKIE_LAW_CONSENT=true"
    }
    data = utils.fetch_url(url, headers)
    return data
def download(self, remotefile, localpath=None, newfile=None):
    url = self.get_download_url(remotefile)
    if localpath:
        localfile = os.path.join(localpath, newfile or os.path.basename(remotefile))
        with open(localfile, 'wb') as fp:
            r = urllib2.urlopen(url)
            shutil.copyfileobj(r, fp)  # or urllib retrieve
        return True
    else:
        return fetch_url(url)
def list_movies(link):
    content = fetch_url(urljoin(HOST, link))
    items = generate_items_from_page(content)
    match = re.search(r"<a class='nextPage' href='(.+?)'>下页</a>", content)  # noqa
    if match:
        items.append({
            'label': u'下一页',
            'path': m.url_for('list_movies', link=match.group(1))
        })
    return m.plugin.finish(items, view_mode='thumbnail')
def show_detail(page_id):
    url = '%sview-%s.html' % (HOST, page_id)
    content = fetch_url(url)
    items = [{
        'label': ''.join(match.groups()[1:]),
        'path': m.url_for('bdyun_link', link=url + match.group(1))
    } for match in re.finditer(
        r'<a class="btn btn-inverse pull-left".*?href="(.+?)".*?>(.+?)<span class="baidusp">(.+?)</span>\s*<span class="baidusp2">(.+?)</span>',
        content)]  # noqa
    if len(items) != 1:
        return m.plugin.finish(items)
    m.plugin.redirect(items[0]['path'])
def get_titulo(pmid):
    '''
    Given a pmid, extract its title from PubMed if it exists.
    '''
    pubmed_url = "https://www.ncbi.nlm.nih.gov/pubmed/"
    html = fetch_url(pubmed_url + pmid)
    for fila in html.split("\n"):
        try:
            clave = fila.index("<title>")
            titulo = fila[clave + 7:fila.index('- PubMed - NCBI', clave + 7)]
            return titulo
        except ValueError:
            pass
def get_link(article):
    host = 'http://es.wikipedia.org/w/api.php?'
    parameters = 'format=json&action=query&prop=info&titles='
    url = host + parameters + article
    json_file = fetch_url(url)
    link_dictionary = json.load(json_file)
    # Get page ID
    pageid = list(link_dictionary['query']['pages'])[0]
    # Page doesn't exist if ID is -1
    if pageid == '-1':
        return
    return 'http://es.wikipedia.org/wiki/%s' % article
def get_abstract(pmid):
    '''
    Given a pmid, extract its abstract from PubMed if it exists.
    '''
    pubmed_url = "https://www.ncbi.nlm.nih.gov/pubmed/"
    html = fetch_url(pubmed_url + pmid)
    for fila in html.split("\n"):
        try:
            clave = fila.index('"abstr"')
            p = fila.index("<p>", clave)
            abstract = fila[p + 3:fila.index('</p>', p + 3)]
            return abstract
        except ValueError:
            pass
def get_doi(pmid):
    '''
    Given a pmid, extract its DOI from PubMed if it exists.
    '''
    pubmed_url = "https://www.ncbi.nlm.nih.gov/pubmed/"
    html = fetch_url(pubmed_url + pmid)
    for fila in html.split("\n"):
        try:
            doi_key = fila.index("DOI:")
            href = fila.index('href="', doi_key)
            doi = fila[href + 16:fila.index('"', href + 16)]
            return doi
        except ValueError:
            pass
def get_keywords(pmid):
    '''
    Given a pmid, extract its keywords from PubMed if they exist.
    '''
    pubmed_url = "https://www.ncbi.nlm.nih.gov/pubmed/"
    html = fetch_url(pubmed_url + pmid)
    for fila in html.split("\n"):
        try:
            clave = fila.index("KEYWORDS:")
            p = fila.index("<p>", clave)
            keywords = fila[p + 3:fila.index('</p>', p + 3)]
            return keywords
        except ValueError:
            pass
def scrape_bill_document_from_sunlight(file_path):
    try:
        file_path = file_path.strip()
        # define path to write file
        out_file_path = file_path.split("/bills")[-1]
        out_file_path = re.sub(r"\s+", "_", out_file_path)
        out_dir_root_path = "{0}/scraped_bills".format(DATA_PATH)
        out_file_name = "{0}{1}.json".format(out_dir_root_path, out_file_path)
        bill_json = json.loads(codecs.open(file_path, encoding="utf8").read())
        # filter versions to be only the first and last
        try:
            bill_json['versions'] = [bill_json['versions'][0], bill_json['versions'][-1]]
        except IndexError:
            return
        base_url = "{0}/{1}".format("http://static.openstates.org/documents", bill_json['state'])
        urls = ["{0}/{1}".format(base_url, x['doc_id']) for x in bill_json['versions']]
        source_urls = [x['url'] for x in bill_json['versions']]
        for i, url in enumerate(urls):
            bill_document = utils.fetch_url(url)
            # hash bill using base64
            if bill_document is not None:
                bill_document = base64.b64encode(bill_document)
            else:
                logging.error("file {0}, url {1}, version {2}, error: << {3} >>".format(
                    file_path, url, i, "link error"))
            bill_json['versions'][i]['bill_document'] = bill_document
        if not os.path.exists(os.path.dirname(out_file_name)):
            os.makedirs(os.path.dirname(out_file_name))
        with codecs.open(out_file_name, "w", encoding="utf8") as f:
            f.write(json.dumps(bill_json))
        logging.info("successfully scraped bill: {0}".format(out_file_path))
    except Exception as e:
        trace_message = re.sub(r"\n+", "\t", traceback.format_exc())
        trace_message = re.sub(r"\s+", " ", trace_message)
        trace_message = "<<{0}>>".format(trace_message)
        m = "Failed to obtain documents for {0}: {1}".format(file_path, trace_message)
        logging.error(m)
    return
def test_fetch_url(self):
    self.assertEqual(None, utils.fetch_url("no such thing"))
    self.assertEqual(None, utils.fetch_url("http://no-such-thing-hopefully.abc/asdasd123"))

    def urlopen_mock(html: bytes, code: int):
        class MockResponse:
            def read(self):
                return html

            def getcode(self):
                return code

        @contextmanager
        def urlopen(url):
            yield MockResponse()

        return urlopen

    with patch("urllib.request.urlopen", urlopen_mock(b"works", 200)):
        self.assertEqual(b"works", utils.fetch_url("http://canonical.com/"))
    # return `None` for any code >= 400
    with patch("urllib.request.urlopen", urlopen_mock(b"some message", 400)):
        self.assertEqual(None, utils.fetch_url("http://canonical.com/"))
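# A minimal sketch of a fetch_url satisfying the contract the test above
# exercises; this is an assumption for illustration, not the project's actual
# utils.fetch_url. Contract: return the body bytes on success, and None for
# malformed URLs, unreachable hosts, or any HTTP status >= 400.
import urllib.request

def fetch_url(url):
    try:
        with urllib.request.urlopen(url) as response:
            if response.getcode() >= 400:
                return None
            return response.read()
    except (ValueError, OSError):
        # ValueError: malformed URL ("no such thing");
        # OSError covers urllib.error.URLError for unreachable hosts
        return None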
def fetch(article_name):
    host = 'http://es.wikipedia.org/w/api.php?'
    parameters = 'format=json&action=query&prop=extracts&exsentences=3&explaintext=true&titles='
    url = host + parameters + urllib2.quote(article_name)
    json_file = fetch_url(url)
    page_dictionary = json.load(json_file)
    # Handle the varying pageid key in the JSON file provided by MediaWiki
    pageid = list(page_dictionary['query']['pages'])[0]
    # Page does not exist
    if pageid == '-1':
        return "The article %s doesn't exist." % (article_name)
    extract = page_dictionary['query']['pages'][pageid]['extract']
    return extract.encode('utf-8')
def scrape_bill_document_from_original_source(filePath):
    filePath = filePath.strip()
    outFilePath = "/".join(filePath.split("/")[7:])
    outFilePath = re.sub(r"\s+", "_", outFilePath)
    outDirRootPath = "/mnt/data/sunlight/dssg/scraped_bills_new"
    outFileName = "{0}/{1}.json".format(outDirRootPath, outFilePath)
    billFile = codecs.open(filePath, encoding="utf8").read()
    billJson = json.loads(billFile)
    # filters documents that are resolutions
    bill_text_count = [1 for x in billJson['type'] if "bill" in x.lower()]
    if sum(bill_text_count) < 1:
        return
    # filter versions to be only the first and last
    billJson['versions'] = [billJson['versions'][0], billJson['versions'][-1]]
    urls = [x['url'] for x in billJson['versions']]
    for i, url in enumerate(urls):
        billDocument = utils.fetch_url(url)
        if billDocument is not None:
            billDocument = base64.b64encode(billDocument)
        else:
            logging.error("file {0}, url {1}, version {2}, error: << {3} >>".format(
                filePath, url, i, "link error"))
        billJson['versions'][i]['bill_document'] = billDocument
    if not os.path.exists(os.path.dirname(outFileName)):
        os.makedirs(os.path.dirname(outFileName))
    with codecs.open(outFileName, "w", encoding="utf8") as f:
        f.write(json.dumps(billJson))
    logging.info("successfully scraped bill: {0}".format(outFilePath))
    return
def leader_board_sample(self, limit=50):
    """
    I have used more than one method to get the initial list of character ids.
    The first version simply used the ids of characters I knew of. This new
    version is a bit less biased: it gathers the players who were in the top
    `limit` places on the current leader-board, for every leader-board
    category available. Note that all leader-board stats are strongly
    correlated.
    """
    seed_ids = []
    for leaderboard_type in ["Kills", "Time", "Deaths", "Score"]:
        fetch_logger().info(f"Fetching {leaderboard_type} {limit}")
        url = f"http://census.daybreakgames.com/s:{SERVICE_ID}/get/" \
              f"{self.namespace}/leaderboard/?name={leaderboard_type}" \
              f"&period=Weekly&world={self.server_id}&c:limit={limit}"
        fetch_logger().info(url)
        decoded = fetch_url(url)
        try:
            decoded_leaderboard = decoded["leaderboard_list"]
        except Exception as err:
            # fetch_logger().error(decoded)
            fetch_logger().error(url)
            fetch_logger().error(f"Failed with {err}")
            raise err
        for characters in decoded_leaderboard:
            character_id = characters.get("character_id")
            if character_id is not None:
                seed_ids.append(character_id)
    unique = list(set(seed_ids))
    # Record the starting nodes for debugging. The busy_timeout prevents
    # an issue where sqlite3 was not waiting long enough.
    # It probably isn't needed but....
    self.archive_connection.execute("PRAGMA busy_timeout = 30000")
    self.archive_connection.execute(
        "INSERT INTO seed_nodes (name,seed_nodes) VALUES(?,?)",
        (self.table_name, ",".join(unique)),
    )
    return seed_ids
def _fetch_pcsapi(self, path, params=None, data=None, headers=None):
    assert self._bduss is not None
    # avoid a shared mutable default dict, since headers is modified below
    headers = dict(headers or {})
    url = urljoin(self._pcsapi_baseUrl, path) + "?app_id=266719"
    if params:
        url += "&" + urlencode(params)
    headers["Cookie"] = "BDUSS=" + self._bduss
    try:
        r = fetch_url(url, data, headers)
    except urllib2.HTTPError as e:
        try:
            error_content = e.read()
            if e.headers.get("content-encoding") == "gzip":
                error_content = unzip(error_content)
            eo = json.loads(error_content)
        except:
            raise e
        else:
            raise PCSApiError(eo.get("error_code"), eo.get("error_msg"))
    return json.loads(r)
def find_article(url, end_article):
    json_file = fetch_url(url)
    link_dictionary = json.load(json_file)
    # Handle the varying pageid key in the JSON file provided by MediaWiki
    pageid = list(link_dictionary['query']['pages'])[0]
    # Page does not exist
    if pageid == '-1':
        return False
    article_entry = {'ns': 0, 'title': end_article}
    if article_entry in link_dictionary['query']['pages'][pageid]['links']:
        return True
    # Continue if link list is not complete yet
    if 'query-continue' in list(link_dictionary):
        continue_string = urllib2.quote(
            link_dictionary['query-continue']['links']['plcontinue'])
        # Endless appending of plcontinues, can be improved
        new_url = url + '&plcontinue=' + continue_string
        return find_article(new_url, end_article)
    else:
        return False
def _fetch_pcsapi(self, path, params=None, data=None, headers={}):
    url = urljoin(self._pcsapi_baseUrl, path) + '?'
    if params:
        url += urlencode(params) + '&'
    url += 'access_token=' + self._access_token
    try:
        r = fetch_url(url, data, headers)
    except urllib2.HTTPError as e:
        try:
            error_content = e.read()
            if e.headers.get('content-encoding') == 'gzip':
                error_content = gzip.GzipFile(
                    fileobj=StringIO(error_content), mode='rb').read()
            eo = json.loads(error_content)
        except:
            raise e
        else:
            raise PCSApiError(eo.get('error_code'), eo.get('error_msg'))
    return json.loads(r)
def login(self):
    page = fetch_url(self.LOGIN_URL)
    if self.logged_in(page):
        return True
    else:
        forms = ParseResponse(mechanize.urlopen(self.LOGIN_URL),
                              backwards_compat=False)
        form = forms[1]
        form['username'] = self.username
        form['password'] = getpass()
        form.find_control("cookietime").items[0].selected = True
        request = form.click()
        try:
            response = mechanize.urlopen(request)
        except mechanize.HTTPError, response2:
            exit('HTTP error while logging in.')
        content = response.read()
        if self.logged_in(content):
            return True
        else:
            return False
def setup(self):
    if os.path.isfile(self.__filename):
        return True
    else:
        return utils.fetch_url(self.__url, self.__filename)
def _download_img(self):
    imgdata = utils.fetch_url(self._imgurl, timeout=10)
    xbmcvfs.File(self._tmp_imgfile, 'w').write(imgdata)
def run_command(message_data):
    sender = message_data['sender']
    said = message_data['said']
    # '#channel' if room, 'sender' if private message
    current_channel = message_data['current_channel']
    params = message_data['params']

    # Get title from web pages
    if 'http://' in said:
        url = extract_url(said)
        title = get_title(url)
        if title:
            say(current_channel, 'Title: %s' % title)

    # Get link to Wikipedia article
    if '[[' in said:
        for article_name in extract_article(said):
            say(current_channel, get_link(article_name))

    # Reply to mention with a random quote
    if nickname in said:
        say(current_channel, random_quote(sender))

    ## IRC commands ##
    search_term = '+'.join(params)

    # List all commands
    if said.find('@help') == 0:
        say(sender, 'Search engines: google, wa, ddg, drae, dpd, en, es')
        say(sender, 'Misc: random [list], conv (unit) to (unit), fetch (wikipedia_article), link <start|get|check|stop>, calc (expression)')
    # Google
    elif said.find('@google') == 0:
        say(current_channel, 'https://www.google.com/search?q=%s' % search_term)
    # Wolfram Alpha
    elif said.find('@wa') == 0:
        say(current_channel, 'http://www.wolframalpha.com/input/?i=%s' % search_term)
    # DuckDuckGo
    elif said.find('@ddg') == 0:
        say(current_channel, 'http://duckduckgo.com/?q=%s' % search_term)
    # DRAE
    elif said.find('@drae') == 0:
        say(current_channel, 'http://lema.rae.es/drae/?val=%s' % search_term)
    # DPD
    elif said.find('@dpd') == 0:
        say(current_channel, 'http://lema.rae.es/dpd/?key=%s' % search_term)
    # Jisho kanji lookup
    elif said.find('@kan') == 0:
        escaped_term = urllib2.quote(search_term)
        say(current_channel, 'http://jisho.org/kanji/details/%s' % escaped_term)
    # EN > JP
    elif said.find('@ei') == 0:
        say(current_channel, 'http://jisho.org/words?jap=&eng=%s&dict=edict' % search_term)
    # JP > EN
    elif said.find('@ni') == 0:
        escaped_term = urllib2.quote(search_term)
        say(current_channel, 'http://jisho.org/words?jap=%s&eng=&dict=edict' % escaped_term)
    # EN > ES
    elif said.find('@en') == 0:
        say(current_channel, 'http://www.wordreference.com/es/translation.asp?tranword=%s' % search_term)
    # ES > EN
    elif said.find('@es') == 0:
        say(current_channel, 'http://www.wordreference.com/es/en/translation.asp?spen=%s' % search_term)
    # Random choice
    elif said.find('@random') == 0:
        if len(params) == 1:
            say(current_channel, 'f****t')
        elif len(params) > 1:
            # pick one of the comma-separated options, stripping whitespace
            say(current_channel, random.choice([s.strip() for s in said.split(',')]))
        else:
            say(current_channel, random.choice([0, 1]))
    # Unit converter
    elif said.find('@conv') == 0:
        if 'to' not in params:
            return
        index = params.index('to')
        amount = params[0]
        unit_from = params[1:index]
        unit_from = urllib2.quote(' '.join(unit_from))
        # 'to' == params[index]
        unit_to = params[index + 1:]
        unit_to = urllib2.quote(' '.join(unit_to))
        conversion_url = 'http://www.google.com/ig/calculator?hl=en&q='
        conversion = fetch_url(conversion_url + amount + unit_from + '=?' + unit_to).read()
        parsed_conversion = conversion.split('"')
        # Check for errors
        if len(parsed_conversion[5]) == 0:
            unit_result = urllib2.unquote(unit_to)
            say(current_channel, '%s %s' % (parsed_conversion[3].split()[0], unit_result))
    # Linkrace module
    elif said.find('@link') == 0:
        # Get race links
        if params[0] == 'get':
            url = 'http://es.wikipedia.org/wiki/%s'
            start, end = random_pair()
            starturl = url % urllib2.quote(start)
            endurl = url % urllib2.quote(end)
            say(current_channel, 'Start article is %s' % starturl)
            say(current_channel, 'Goal article is %s' % endurl)
        # Check if chain is valid
        elif params[0] == 'check':
            chain = ' '.join(params[1:])
            broken_links = check_chain(chain)
            if not broken_links:
                say(current_channel, 'The chain is valid.')
            else:
                error_list = ' | '.join(broken_links)
                say(current_channel, error_list)
                say(current_channel, 'The chain is not valid.')
    # Calculator
    elif said.find('@calc') == 0:
        expression = ''.join(params)
        result = str(calculate(expression))
        say(current_channel, result)
    # Wikipedia fetch
    elif said.find('@fetch') == 0:
        article_name = ' '.join(params)
        extract = fetch(article_name)
        say(current_channel, extract)
    # Text game
    elif said.find('@dicks') == 0:
        global game
        # Commands available for everyone
        if params[0] == 'join':
            game.join_game(sender)
        elif params[0] == 'players':
            say(current_channel, [player.name for player in game.players])
        # Commands available for players
        if sender in [player.name for player in game.players]:
            if params[0] == 'panel':
                panel_url = sprunge(game.panel(sender))
                say(sender, '[i] Uploading panel')
                say(sender, panel_url)
            elif params[0] == 'settle':
                group = params[1]
                game.settle(sender, group)
            elif params[0] == 'move':
                troop = params[1]
                new_position = [params[2], params[3]]
                game.move(sender, troop, new_position)

    ## Owner commands ##
    if sender == owner:
        # Disconnect
        if said == '.quit':
            execute('QUIT')
            sys.exit(0)
        # Send message from bot
        elif said.find('.say') == 0:
            if len(params) > 1:
                say(params[0], ' '.join(params[1:]))
        # Print userlist
        elif said.find('.users') == 0:
            say(current_channel, str(users))
        # Bot joins
        elif said.find('.join') == 0:
            channel = params[0]
            execute('JOIN %s' % channel)
        # Bot parts
        elif said.find('.part') == 0:
            execute('PART %s' % current_channel)
            del users[current_channel]
        # Bot kicks
        elif said.find('.kick') == 0:
            user = params[0]
            reason = ' '.join(params[1:])
            if not reason:
                reason = 'huh'
            bot_kick(current_channel, user, reason)
def get_metrics(start, end, author=None, config=None):
    start = datetime.strptime(start, "%Y-%m-%d %H:%M:%S")
    end = datetime.strptime(end, "%Y-%m-%d %H:%M:%S")
    metrics = {
        'commits': 0,
        'git_push': 0,
        'issues': {
            'opened': 0,
            'closed': 0,
        },
        'merge_requests': {
            'opened': 0,
            'closed': 0,
        },
    }
    if config is None:
        config = get_config(os.environ['HOME'] + '/.qe-metrics/gitlab.conf')
    url = config['url']
    user = config['username']
    token = config['private_token']
    if not author:
        author = user

    ##### NEW issues opened during period
    atom_url = '%s/u/%s.atom?private_token=%s' % (url, author, token)
    dom = parseString(fetch_url(atom_url))
    for entry in dom.getElementsByTagName("entry"):
        title = entry.getElementsByTagName("title")[0].firstChild.wholeText
        updated_on = entry.getElementsByTagName("updated")[0].firstChild.wholeText
        updated_on = datetime.strptime(updated_on, "%Y-%m-%dT%H:%M:%SZ")

        # skip older or newer events
        if not (start <= updated_on <= end):
            continue

        if title.find('pushed') > -1:
            metrics['git_push'] += 1
            # the commits count is a bit wrong:
            # when pushing to a new branch for the first time,
            # GitLab reports commits from other users as well,
            # for example when merging to the latest upstream.
            # This can be fixed by a second parameter, the user's real name
            # as it appears in the commits, but that doesn't work
            # nicely with --author and isn't that important for now!
            for summary in entry.getElementsByTagName("summary"):
                for a in summary.getElementsByTagName("a"):
                    href = a.getAttribute('href')
                    if href.find('/commit/') > -1:
                        metrics['commits'] += 1
        elif title.find('opened issue') > -1:
            metrics['issues']['opened'] += 1
        elif title.find('closed issue') > -1:
            metrics['issues']['closed'] += 1
        elif title.find('opened MR') > -1:
            metrics['merge_requests']['opened'] += 1
        elif title.find('accepted MR') > -1:
            metrics['merge_requests']['closed'] += 1
    return metrics