def parse_item(self, response):
    hxs = Selector(response)
    item_titles = extract(hxs, "//div[@class='gl-i-wrap j-sku-item']//a/em/text()")
    top_id = extract_one(hxs, '//*[@id="J_crumbsBar"]/div/div/div/div[1]/a/text()')
    type_id1 = extract(hxs, '//*[@id="J_crumbsBar"]//div[@class="trigger"]/span/text()')[0]
    type_id2 = extract(hxs, '//*[@id="J_crumbsBar"]//div[@class="trigger"]/span/text()')[-1]
    if type_id1 != type_id2:
        for i, t in enumerate(item_titles):
            if i < 20:
                good = {
                    'mall': '2',
                    'rank': str(i + 1),
                    'title': t,
                    'price': '0',
                    'turnover_index': '0',
                    'top_id': top_id,
                    'type_id1': type_id1,
                    'type_id2': type_id2,
                    'url': response.url
                }
                yield Good(good)
    for link in self.normal_url_extractor.extract_links(response):
        yield SplashRequest(link.url, callback=self.parse_url,
                            args={'wait': 0.5, 'html': 1})
    for link in self.needed_url_extractor.extract_links(response):
        if 'ev' not in link.url:
            url = re.sub(r'page=.*&', 'page=1&', link.url)
            url = re.sub(r'stock=.*&', 'stock=0&', url)
            url = re.sub(r'delivery_daofu=.*&', 'delivery_daofu=0&', url)
            url = re.sub(r'delivery=.*&', 'delivery=0&', url)
            yield SplashRequest(url, callback=self.parse_item,
                                args={'wait': 0.5, 'html': 1})
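The `extract` and `extract_one` helpers used throughout these spiders are not defined in the snippets; a minimal sketch of what they might look like, assuming a standard Scrapy `Selector` (hypothetical, inferred from the call sites):

def extract(selector, xpath):
    # Return every text fragment / attribute value matched by the XPath expression.
    return selector.xpath(xpath).extract()

def extract_one(selector, xpath):
    # Return only the first match, or None if nothing matched.
    return selector.xpath(xpath).extract_first()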
def parse_item(self, response):
    hxs = Selector(response)
    top_id = re.findall(r'.*&topId=(\S+_\S+)&type.*', response.url)[0]
    # type_id = re.findall(r'.*leafId=(\d+)&rank=.*', response.url)[0]
    type_id1 = extract_one(
        hxs,
        "//div[@class='block-body ']/div[@class='params-cont']/a[@class='param-item icon-tag param-item-selected']/text()"
    )
    ranks_tuple = extract(
        hxs,
        '//*[@class="rank-num rank-focus"]/text()|//*[@class="rank-num rank-important"]/text()|//*[@class="rank-num rank-"]/text()'
    )
    ranks = []
    for r in ranks_tuple:
        if r.strip() != '':
            ranks.append(r)
    titles = extract(hxs, '//*[@class="title"]/a/text()')
    prices = extract(hxs, '//*[@class="col3 col"]/text()')[1:]
    turnover_indexs = extract(hxs, '//*[@class="focus-bar"]/span/text()')
    for r, t, p, i in zip(ranks, titles, prices, turnover_indexs):
        good = {
            'mall': '0',
            'rank': r.strip(),
            'title': t.strip(),
            'price': p.split('¥')[-1].strip(),
            'turnover_index': i.strip(),
            'top_id': top_id.strip(),
            'type_id1': type_id1.strip(),
            'type_id2': '',
            'url': response.url
        }
        yield Good(good)
def load_lists(opt):
    arch_name = 'ava_v{}.zip'.format(opt.version)
    arch_path = os.path.join(opt.out_path, arch_name)
    arch_url = 'https://research.google.com/ava/download/{}'.format(arch_name)
    if utils.download_file(arch_url, arch_path):
        utils.extract(arch_path, opt.out_path)

    train_video_ids, val_video_ids, test_video_ids = None, None, None
    if opt.type is None or opt.type == 'train':
        ids_file_path = os.path.join(opt.out_path, 'ava_train_v{}.csv'.format(opt.version))
        train_video_ids = read_ids(ids_file_path)
    if opt.type is None or opt.type == 'validation':
        ids_file_path = os.path.join(opt.out_path, 'ava_val_v{}.csv'.format(opt.version))
        val_video_ids = read_ids(ids_file_path)
    if opt.type is None or opt.type == 'test':
        ids_file_path = os.path.join(opt.out_path, 'ava_test_v{}.txt'.format(opt.version))
        test_video_ids = read_ids(ids_file_path)

    ts_file_name = 'ava_included_timestamps_v{}.txt'.format(opt.version)
    ts_file_path = os.path.join(opt.out_path, ts_file_name)
    with open(ts_file_path) as f:
        lines = f.readlines()
        timestamps = int(lines[0]), int(lines[-1])

    return train_video_ids, val_video_ids, test_video_ids, timestamps
def preprocess(no_wells_marmousi, no_wells_seam):
    """Function initializes data, performs standardization, and train test split

    Parameters
    ----------
    no_wells_marmousi : int
        number of evenly spaced wells and seismic samples to be evenly
        sampled from the Marmousi section
    no_wells_seam : int
        number of evenly spaced wells and seismic samples to be evenly
        sampled from SEAM

    Returns
    -------
    seismic_marmousi : array_like, shape(num_traces, depth samples)
        2-D array containing the seismic section for Marmousi
    seismic_seam : array_like, shape(num_traces, depth samples)
        2-D array containing the seismic section for SEAM
    model_marmousi : array_like, shape(num_wells, depth samples)
        2-D array containing the model section from Marmousi 2
    model_seam : array_like, shape(num_wells, depth samples)
        2-D array containing the model section from SEAM
    """

    # get project root directory
    project_root = os.getcwd()

    if not os.path.isdir('data'):  # if the data directory does not exist, extract it
        extract('data.zip', project_root)

    # Load data
    seismic_marmousi = np.load(join('data', 'marmousi_synthetic_seismic.npy')).squeeze()
    seismic_seam = np.load(join('data', 'poststack_seam_seismic.npy')).squeeze()[:, 50:]
    seismic_seam = seismic_seam[::2, :]

    # Load targets and standardize data
    model_marmousi = np.load(join('data', 'marmousi_Ip_model.npy')).squeeze()[::5, ::4]
    model_seam = np.load(join('data', 'seam_elastic_model.npy'))[::3, :, ::2][:, :, 50:]
    model_seam = model_seam[:, 0, :] * model_seam[:, 2, :]

    # standardize
    seismic_marmousi, model_marmousi = standardize(seismic_marmousi, model_marmousi, no_wells_marmousi)
    seismic_seam, model_seam = standardize(seismic_seam, model_seam, no_wells_seam)

    return seismic_marmousi, seismic_seam, model_marmousi, model_seam
def install(): fetch("http://ftp.gnome.org/pub/gnome/sources/json-glib/0.16/json-glib-%(json-glib)s.tar.xz") extract("json-glib-%(json-glib)s.tar.xz") configure( "json-glib-%(json-glib)s", ["--prefix=%s" % env.prefix, "--disable-gcov", "--disable-introspection", "CC=clang"] ) make("json-glib-%(json-glib)s") make("json-glib-%(json-glib)s", "install")
def test_extract(self):
    # empty output directory
    utils.init_path(self.output_dir)
    utils.extract(self.archive_file, self.output_dir)
    files = os.listdir(self.output_dir)
    with tarfile.open(self.archive_file) as f:
        for file in files:
            assert file in f.getnames()
async def autoip(self, ctx, *, options: str = None):
    options = options.split(" ") if options else []
    version = extract(options, "46")
    addr_class = extract(options, "abc", func="lower")
    if not version or version != "6":
        func = self.fake.ipv4(address_class=addr_class)
    else:
        func = self.fake.ipv6()
    return await self.send(ctx, "IP address", func)
def download_libs(constants: Constants, settings: {}, progress: sg.ProgressBar):
    archive = os.path.join(settings["installdir"], "libraries.zip")
    utils.download_file(constants.win64_dev_libs, archive, 2492854)
    utils.extract(archive, settings["installdir"])
    dlldir = os.path.join(settings["installdir"], "dev64", "bin")
    for lib in os.listdir(dlldir):
        shutil.copyfile(os.path.join(dlldir, lib),
                        os.path.join(settings["installdir"], lib))
def incorrectly_ordered_boolean(if_ast: dict, code, code_identifier_lst):
    if if_ast["test"]["type"] == "LogicalExpression" and if_ast["test"]["operator"] == "&&":
        code_left = utils.extract(if_ast["test"]["left"]["loc"], code)
        code_right = utils.extract(if_ast["test"]["right"]["loc"], code)
        if code_left in code_right:
            # TODO similarity
            tmp = if_ast["test"]["left"]
            if_ast["test"]["left"] = if_ast["test"]["right"]
            if_ast["test"]["right"] = tmp
            return True
def parse(self, response):
    xxs = scrapy.Selector(response)
    source = "baidu_" + extract(xxs.xpath('//channel/title/text()'))
    for xItem in xxs.xpath('//item'):
        item = Article()
        item['source'] = source
        item['title'] = extract(xItem.xpath('./title/text()'))
        item['link'] = extract(xItem.xpath('./link/text()'))
        item['desc'] = extract(xItem.xpath('./description/text()'))
        item['pubDate'] = extract(xItem.xpath('./pubDate/text()'))
        yield item
def read_dataframe(out_path, split):
    arch_url = KIN_PARAMS[split]['arch_url']
    arch_path = os.path.join(out_path, KIN_PARAMS[split]['arch_name'])
    csv_name = KIN_PARAMS[split]['csv_name']
    csv_path = os.path.join(out_path, csv_name)
    if utils.download_file(arch_url, arch_path):
        utils.extract(arch_path, out_path, csv_name)
    df = pd.read_csv(csv_path)
    if 'label' in df.columns:
        df['label'] = df['label'].astype('category')
    return df
async def autocolor(self, ctx, *, options: str = None):
    options = options.split(" ") if options else []
    color_format = extract(options, COLOR_FORMATS, func="lower", default="hex")
    hue = extract(options, HUES, func="lower")
    luminosity = extract(options, LUMINOSITIES, func="lower", default="random")
    return await self.send(
        ctx,
        f"{color_format} color",
        self.fake.color(hue=hue, luminosity=luminosity, color_format=color_format)
    )
def _get_toolchain(tree, root, keep_archive=False, clean=False):
    archive = _create_name(tree, suffix=".tar.bz2")
    tree.insert(0, root)
    dir = path.join(_create_path(tree))
    archive_dir = path.join(dir, archive)
    if not check_path(dir, clean):
        if download(TOOLCHAIN_FORGE + archive, archive_dir):
            extract(archive_dir, dir)
            if not keep_archive:
                remove(archive_dir)
    else:
        print "! %s already exists" % dir
def install():
    fetch('http://www.pell.portland.or.us/~orc/Code/discount/discount-%(discount)s.tar.bz2')
    extract('discount-%(discount)s.tar.bz2')
    configure('discount-%(discount)s',
              ['--prefix=%s' % env.prefix,
               '--libdir=%s/lib' % env.prefix,
               '--mandir=%s/man' % env.prefix,
               '--shared',
               '--enable-all-features'],
              'configure.sh')
    run('sed -i .bkp -e "/ldconfig/d" %s/%s/librarian.sh' % (env.build, 'discount-%(discount)s' % env.versions))
    make('discount-%(discount)s')
    make('discount-%(discount)s', 'install')
def video_info(filepath):
    """Return some video meta information as a dictionary."""
    ffmpeg = subprocess.Popen("ffmpeg -i " + filepath, shell=True,
                              stderr=subprocess.PIPE)
    stdout, stderr = ffmpeg.communicate()
    info = {}
    info['creation'] = extract('creation_time[ ]+: ([0-9-]* [0-9:]*)', stderr, 1, timestamp)
    info['duration'] = extract('Duration: ([0-9:\.]*)', stderr, 1)
    info['detected fps'] = extract('([0-9]*.?[0-9]*) fps,', stderr, 1, float)
    info['w'], info['h'] = extract('Stream.*, ([0-9]+)x([0-9]+)', stderr, (1, 2),
                                   lambda (x, y): (int(x), int(y)))
    return info
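The regex-based `extract` helper used by `video_info()` is not shown here; a minimal sketch of what it might look like, with a signature inferred from the calls above (pattern, text, group index or tuple of indices, optional conversion function) -- hypothetical, not the project's actual implementation:

import re

def extract(pattern, text, group=1, convert=None):
    # Search `text` for `pattern`, pull out the requested group(s),
    # and optionally pass the result through `convert`. None if no match.
    match = re.search(pattern, text)
    if match is None:
        return None
    value = match.group(*group) if isinstance(group, tuple) else match.group(group)
    return convert(value) if convert else value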
def parse(self, response):
    hxs = Selector(response, type="html")
    item_url_list = extract(hxs, "//div[@class='block-body ']/div[@class='params-cont']/a/@href")
    # //div[@class='block-body ']/div[@class='params-cont']/a/@href
    for url in item_url_list:
        url = url.replace('./index.php?', 'https://top.taobao.com/index.php?')
        yield SplashRequest(url, callback=self.extract_url,
                            args={'wait': 0.5, 'html': 1})
def search(cls, query_params):

    # NOTE: Params 'recursive' and 'with_responses' are currently not used by
    # either the 'search' or 'get_all' actions below. Both already use
    # with_responses=False internally in the comment service, so no additional
    # optimization is required.
    params = {
        'page': 1,
        'per_page': 20,
        'course_id': query_params['course_id'],
    }
    params.update(utils.strip_blank(utils.strip_none(query_params)))

    if query_params.get('text'):
        url = cls.url(action='search')
    else:
        url = cls.url(action='get_all', params=utils.extract(params, 'commentable_id'))
        if params.get('commentable_id'):
            del params['commentable_id']

    response = utils.perform_request(
        'get',
        url,
        params,
        metric_tags=[u'course_id:{}'.format(query_params['course_id'])],
        metric_action='thread.search',
        paged_results=True)

    if query_params.get('text'):
        search_query = query_params['text']
        course_id = query_params['course_id']
        group_id = query_params['group_id'] if 'group_id' in query_params else None
        requested_page = params['page']
        total_results = response.get('total_results')
        corrected_text = response.get('corrected_text')
        # Record search result metric to allow search quality analysis.
        # course_id is already included in the context for the event tracker
        tracker.emit(
            'edx.forum.searched',
            {
                'query': search_query,
                'corrected_text': corrected_text,
                'group_id': group_id,
                'page': requested_page,
                'total_results': total_results,
            }
        )
        log.info(
            u'forum_text_search query="{search_query}" corrected_text="{corrected_text}" course_id={course_id} group_id={group_id} page={requested_page} total_results={total_results}'.format(
                search_query=search_query,
                corrected_text=corrected_text,
                course_id=course_id,
                group_id=group_id,
                requested_page=requested_page,
                total_results=total_results))

    return utils.CommentClientPaginatedResult(
        collection=response.get('collection', []),
        page=response.get('page', 1),
        num_pages=response.get('num_pages', 1),
        thread_count=response.get('thread_count', 0),
        corrected_text=response.get('corrected_text', None))
def parse_shebang(s: str) -> str:
    """Extract the interpreter token from a shebang like `#!/bin/sh`.

    https://en.wikipedia.org/wiki/Shebang_(Unix)

    :param s: shebang
    :return: shebang token
    """
    script = s
    try:
        match = extract(s, REGEX_SHEBANG_FULL)
        script = match.group().split('/')[-1]
        pos = match.end()
        match = extract(s, REGEX_SHEBANG_WHITESPACE, pos=pos)
        pos = match.end()
        match = extract(s, REGEX_SHEBANG_NON_WHITESPACE, pos=pos)
        return extract(match.group(), compile(r'[^\d]+')).group(0)
    except ExtractException:
        return script
def __init__(self, bug, hash):
    """
    Initialize comments

    :arg hash: Dictionary of comment details
    :arg bug: Instance of :class:`~bz_xmlrpc.classes.Bug` object
    :return: Instance of :class:`Comment`

    .. note:: No need to use this directly.
        Use :meth:`~bz_xmlrpc.classes.Bug.get_comments()`
    """
    self._hash = hash
    self.id = extract(hash, 'id', 'comment_id')
    self.author = extract(hash, 'email', 'author')
    self.bug = bug
    self.is_private = bool(extract(hash, 'is_private', 'isprivate'))
    self.text = extract(hash, 'text', 'body')
    self.time = to_datetime(extract(hash, 'time', 'bug_when'))
def do_scatter(i, j, ax):
    """ Draw single scatter plot
    """
    xs, ys = utils.extract(i, j, steadies)
    ax.scatter(xs, ys)
    ax.set_xlabel(r"$S_%d$" % i)
    ax.set_ylabel(r"$S_%d$" % j)

    cc = utils.get_correlation(xs, ys)
    ax.set_title(r"Corr: $%.2f$" % cc)
def __init__(self, hash):
    """
    Initialize
    """
    self._hash = hash
    if isinstance(hash, str):
        # Hack for searched bug groups
        self.name = hash
        self.ison = True
    else:
        self.bit = extract(hash, 'bit', 'id')
        self.name = extract(hash, 'name')
        self.description = extract(hash, 'description')
        self.ingroup = bool(extract(hash, 'ingroup'))
        self.ison = bool(extract(hash, 'ison'))
        self.mandatory = bool(extract(hash, 'mandatory'))
        self.othercontrol = bool(extract(hash, 'othercontrol'))
        self.direct = bool(extract(hash, 'direct'))
        self.isbuggroup = bool(extract(hash, 'isbuggroup'))
        self.userregexp = extract(hash, 'userregexp')
def parse_item(self, response):
    hxs = Selector(response)
    item_titles = extract(hxs, "//div[@id='J_ItemList']//p[@class='productTitle']/a/text()")
    top_id = extract_one(hxs, '//*[@id="J_CrumbSlideCon"]/li[2]/a/text()')
    type_id1 = extract(hxs, '//*[@id="J_CrumbSlideCon"]//div[@class="crumbDrop j_CrumbDrop"]/a/text()')
    if type_id1 is not None:
        if len(type_id1) > 1:
            type_id2 = type_id1.split('/n')[-1]
        else:
            type_id2 = ''
        type_id1 = type_id1.split('/n')[0]
        titles = []
        title = ''
        for t in item_titles:
            if not t.endswith('\n'):
                title += t.strip()
            elif t.endswith('\n'):
                title += t.strip()
                if len(title) > 5:
                    titles.append(title.strip())
                    title = ''
        if len(titles) > 19:
            for i, t in enumerate(titles):
                if i < 20:
                    good = {
                        'mall': '1',
                        'rank': str(i + 1),
                        'title': t.strip(),
                        'price': '0',
                        'turnover_index': '0',
                        'top_id': top_id.strip(),
                        'type_id1': type_id1.strip(),
                        'type_id2': type_id2.strip(),
                        'url': response.url
                    }
                    yield Good(good)
    for link in self.normal_url_extractor.extract_links(response):
        yield SplashRequest(link.url, callback=self.parse,
                            args={'wait': 0.5, 'html': 1})
def preprocess(no_wells):
    """Function initializes data, performs standardization, and train test split

    Parameters
    ----------
    no_wells : int
        number of evenly spaced wells and seismic samples to be evenly
        sampled from the seismic section

    Returns
    -------
    seismic : array_like, shape(num_traces, depth samples)
        2-D array containing the seismic section
    model : array_like, shape(num_wells, depth samples)
        2-D array containing the model section
    """

    # get project root directory
    project_root = os.getcwd()

    if not os.path.isdir('data'):  # if the data directory does not exist, extract it
        extract('data.zip', project_root)

    # Load data
    seismic = np.load(join('data', 'poststack_seam_seismic.npy')).squeeze()[:, 50:]
    seismic = seismic[::2, :]

    # Load targets and standardize data
    model = np.load(join('data', 'seam_elastic_model.npy'))[::3, :, ::2][:, :, 50:]
    model = model[:, 0, :] * model[:, 2, :]

    # standardize
    seismic, model = standardize(seismic, model, no_wells)

    return seismic, model
def criterion(output, target):
    prop, box = output
    target_instance = extract(target.cpu(), box, resize)
    loss, ok = 0., False
    for i, img in enumerate(target_instance):
        if prop[i] is not None:
            z_target = ab2z(img)
            loss += MCE(prop[i].cpu(), z_target,
                        weights=w[z_target.argmax(dim=-1)]).mean()
            ok = True
    if not ok:
        loss = torch.tensor(0., requires_grad=True)
    return loss
def __init__(self, mapp, img, K):
    self.K = K
    self.Kinv = np.linalg.inv(self.K)
    self.pose = np.eye(4)
    self.h, self.w = img.shape[0:2]

    self.kpus, self.des = extract(img)
    self.kps = normalize(self.Kinv, self.kpus)
    self.pts = [None] * len(self.kps)

    self.id = len(mapp.frames)
    mapp.frames.append(self)
def fetch(self):
    """Download and extract the dataset."""
    home = self.home()
    if not path.exists(home):
        os.makedirs(home)

    # download archives
    archive_filenames = []
    for key, archive in self.ARCHIVES.iteritems():
        url = archive['url']
        sha1 = archive['sha1']
        basename = path.basename(url)
        archive_filename = path.join(home, basename)
        if not path.exists(archive_filename):
            download(url, archive_filename, sha1=sha1)
        archive_filenames += [(archive_filename, sha1)]
        self.ARCHIVES[key]['archive_filename'] = archive_filename

    # extract them
    for name, archive in self.ARCHIVES.iteritems():
        archive_dir = path.join(home, name)
        if os.path.exists(archive_dir):
            continue
        url = archive['url']
        sha1 = archive['sha1']
        archive_filename = archive['archive_filename']
        extract(archive_filename, home, sha1=sha1, verbose=True)

        # move around stuff if needed
        if 'moves' in archive:
            for move in archive['moves']:
                src = self.home(move['source'])
                dst = self.home(move['destination'])
                # We can't use shutil here since the destination folder
                # may already exist. Fortunately the distutils can help
                # us here (see standard library).
                dir_util.copy_tree(src, dst)
                dir_util.remove_tree(src)
def fetch(self, download_if_missing=True):
    """Download and extract the dataset."""
    home = self.home()

    if not download_if_missing:
        raise IOError("'%s' exists!" % home)

    # download archive
    url = self.URL
    sha1 = self.SHA1
    basename = path.basename(url)
    archive_filename = path.join(home, basename)
    if not path.exists(archive_filename):
        if not download_if_missing:
            return
        if not path.exists(home):
            os.makedirs(home)
        download(url, archive_filename, sha1=sha1)

    # extract it
    if not path.exists(self.home(self.SUBDIR)):
        extract(archive_filename, home, sha1=sha1, verbose=True)
def parse_item(self, response):
    hxs = Selector(response)
    search_condition = extract_one(hxs, '//*[@id="J_CrumbSearchInuput"]/@value')
    item_titles = extract(hxs, "//div[@id='J_ItemList']//p[@class='productTitle']/a/text()")
    top_id = extract_one(hxs, '//*[@id="J_CrumbSlideCon"]/li[2]/a/text()')
    type_id1 = extract_one(hxs, '//*[@id="J_CrumbSlideCon"]//div[@class="crumbDrop j_CrumbDrop"]/a/text()')
    if type_id1 is not None and search_condition is not None:
        type_id1 = type_id1.split('/n')[0]
        titles = []
        title = ''
        for t in item_titles:
            if not t.endswith('\n'):
                title += t.strip()
            elif t.endswith('\n'):
                title += t.strip()
                if len(title) > 5:
                    titles.append(title.strip())
                    title = ''
        if len(titles) > 19 and search_condition != type_id1:
            for i, t in enumerate(titles):
                if i < 20:
                    good = {
                        'mall': '1',
                        'rank': str(i + 1),
                        'title': t.strip(),
                        'price': '0',
                        'turnover_index': '0',
                        'top_id': top_id.strip(),
                        'type_id1': type_id1.strip(),
                        'type_id2': search_condition.strip(),
                        'url': response.url
                    }
                    yield Good(good)
    for link in self.needed_url_extractor.extract_links(response):
        if ('industryCatId' in link.url and 'cat' in link.url
                and 'post_fee' not in link.url and 'brand' not in link.url):
            url = re.sub(r'sort=.*&', 'sort=d&', link.url)
            url = re.sub(r'search_condition=.*&', 'search_condition=7', url)
            url = re.sub(r'miaosha=.*&', 'miaosha=0&', url)
            url = re.sub(r'wwonline=.*&', 'wwonline=0&', url)
            yield SplashRequest(url, callback=self.parse_item,
                                args={'wait': 0.5, 'html': 1})
def get_and_run_installer(self, installer):
    installer_exe = os.path.abspath(os.path.basename(installer))
    if not os.path.isfile(installer_exe):
        print 'Downloading', installer, '..',
        installer_exe = download(installer)
        if installer_exe is None:
            print 'Download FAILED'
            return False
        print 'DONE'
    if os.path.splitext(installer_exe)[-1] in ['.zip']:
        install_path = self.get_install_path(installer_exe)
        if install_path is not None:
            if not os.path.isdir(install_path):
                os.makedirs(install_path)
            return bool(extract(installer_exe, install_path))
    elif not start_installer(installer_exe):
        print 'Failed to start', installer_exe
        return False
    return True
def extract_triples(hbt_model, save_weights_path, path, author,
                    subject_model, object_model, tokenizer, id2rel):
    workbook = xlwt.Workbook(encoding='utf-8')
    ws = workbook.add_sheet('sheet1', cell_overwrite_ok=True)
    ws.write(0, 0, "head")
    ws.write(0, 1, "tail")
    ws.write(0, 2, "relation")
    hbt_model.load_weights(save_weights_path)
    triples = extract(path, subject_model, object_model, tokenizer, id2rel)
    count = 0
    triple_str = ""
    for triple_list in triples:
        for triple in triple_list:
            count += 1
            ws.write(count, 0, triple[0])
            ws.write(count, 1, triple[1])
            ws.write(count, 2, triple[2])
    workbook.save(path + author + ".xls")
def __init__(self, **kwargs):
    """
    Initialize a Bugzilla instance.

    Optional Arguments:
    -------------------
    url : The Bugzilla URL. May or may not end with /xmlrpc.cgi.
        If it does not end with /xmlrpc.cgi, it will be assumed.
        If not provided, the value of BUGZILLA_URL will be used.
    cookie_jar|cookiejar : cookielib.CookieJar/MozillaCookieJar object.
    user|username|login : Bugzilla login, usually an email id.
    password|passwd : Password for bugzilla
    http_proxy|proxy : String specifying the HTTP proxy of the client's
        connection. Usually of the form server:port or http://server:port
    bypass : boolean value, asks client to bypass password auth and use
        cookies if present
    """
    # Initialize public attributes for an unlogged instance
    self.user_agent = USER_AGENT
    self.logged_in = False
    self.user_id = None

    self._init_private_data()

    # Extract provided values or default
    self._cookiejar = extract(kwargs, 'cookie_jar', 'cookiejar')
    self.url = extract(kwargs, 'url') or BUGZILLA_URL
    self.user = extract(kwargs, 'user', 'username', 'login') or ''
    self.password = extract(kwargs, 'password', 'passwd') or ''
    self.http_proxy = extract(kwargs, 'http_proxy', 'proxy') or ''
    self.bypass = extract(kwargs, 'bypass') or ''
    cookie_dir = extract(kwargs, 'cookie_dir') or COOKIE_DIR
    if not os.path.exists(cookie_dir):
        os.mkdir(cookie_dir)
    self.cookiefile = os.path.join(cookie_dir, '%s.cookie' % self.user)

    self.connect()
def wrong_identifier(if_ast: dict, code, code_identifier_lst):
    code_condition_padded = utils.extract(if_ast["test"]["loc"], code, padding=5)
    condition_identifier_lst = []
    utils.dict_visitor(if_ast["test"], identifiers=condition_identifier_lst)
    if len(condition_identifier_lst):
        identifier_to_augment = random.choice(condition_identifier_lst)
        # TODO identifier must stand alone?
        for identifier in code_identifier_lst:
            identifier_start = identifier["loc"]["start"]["line"]
            augment_start = identifier_to_augment["loc"]["start"]["line"]
            if (identifier_start < (augment_start - 5)
                    and identifier["name"] not in code_condition_padded):
                # TODO choose most similar identifier
                # TODO near neighborhood could be feasible
                identifier_to_augment["name"] = identifier["name"]
                random.shuffle(code_identifier_lst)
                return True
def predict(self, F, data_info, time_info):
    '''
    This function should provide predictions of labels on (test) data.
    Make sure that the predicted values are in the correct format for the
    scoring metric. For example, binary classification problems often expect
    predictions in the form of a discriminant value (if the area under the
    ROC curve is the metric) rather than predictions of the class labels
    themselves. The predict function eventually returns probabilities or
    continuous values.
    '''
    info_dict = extract(data_info, time_info)
    print_time_info(info_dict)

    if params['algo'] == Algo.OLD_CODE:
        return self.mdl.predict(F, data_info, time_info)
    elif params['algo'] == Algo.ORIGINAL:
        return self._original_predict(F, info_dict)
    elif params['algo'] == Algo.FACEBOOK_LR:
        return self._facebook_lr_predict(F, info_dict)
    elif params['algo'] == Algo.BASIC:
        return self._basic_predict(F, info_dict)
def on_press(self, event):
    value = self.text_ctrl.GetValue()
    if not value:
        print("You didn't enter anything!")
    else:
        self.text_ctrl.Hide()
        png = wx.Image('img/whatever.png', wx.BITMAP_TYPE_ANY).ConvertToBitmap()
        wx.StaticBitmap(self, -1, png, (0, 0), (png.GetWidth(), png.GetHeight()))
        if os.path.exists("result.json"):
            os.remove("result.json")
        wordlist = utils.extract(value)
        words = ",".join(wordlist)
        path = utils.getPath()
        utils.crawl(words)
        output = utils.process()
        utils.writelist(output, path)
        png = wx.Image('img/finish.png', wx.BITMAP_TYPE_ANY).ConvertToBitmap()
        wx.StaticBitmap(self, -1, png, (0, 0), (png.GetWidth(), png.GetHeight()))
def get_and_install_source(self, installer):
    installer_file = os.path.abspath(os.path.basename(installer))
    if not os.path.isfile(installer_file):
        print 'Downloading', installer, '..',
        installer_file = download(installer)
        if installer_file is None:
            print 'Download FAILED'
            return False
        print 'DONE'
    install_path = self.get_install_path(installer_file)
    if install_path is not None:
        if not os.path.isdir(install_path):
            os.makedirs(install_path)
    else:
        install_path = '.'
    content = extract(installer_file, install_path)
    if not content:
        return False
    cwd = install_path
    for p in content:
        if os.path.isdir(p):
            cwd = p
            break
    return self.install_source(cwd)
def get_and_install_source(self, source):
    source_file = os.path.abspath(os.path.basename(source))
    if not os.path.isfile(source_file):
        print 'Downloading', source, '..',
        source_file = download(source)
        if source_file is None:
            print 'Download FAILED'
            return False
        print 'DONE'
    source_path = self.get_source_path(source_file)
    if source_path is not None:
        if not os.path.isdir(source_path):
            os.makedirs(source_path)
    else:
        source_path = '.'
    content = extract(source_file, source_path)
    if not content:
        return False
    cwd = source_path
    for p in content:
        if os.path.isdir(p):
            cwd = p
            break
    return self.install_source(os.path.abspath(cwd))
def install():
    fetch('http://piumarta.com/software/peg/peg-%(peg)s.tar.gz')
    extract('peg-%(peg)s.tar.gz')
    make('peg-%(peg)s', 'CC=clang')
    make('peg-%(peg)s', 'PREFIX=%s install' % env.prefix)
def execute(self, args=None):
    validTemplateNames = ['helloworld', 'helloworldwebapp', 'pale'] + customStarterApps
    if not args:
        print self.shorthelp
        print 'available app templates:'
        print 'helloworld -- simple helloworld app'
        print 'helloworldwebapp -- simple helloworld app using webapp fmk'
        print 'xmppsendandreply -- simple xmpp (instant message) send and reply'
        print 'emailreceive -- simple e-mail receive example'
        print 'emailsendui -- simple e-mail send example'
        print 'deferredemail -- simple deferred lib queued e-mail send example'
        print 'starter_pale -- a basic project layout with buckets for most things you could want and an import fix built in'
    else:
        templateName = args[0].lower()
        if templateName not in validTemplateNames:
            print 'Unknown app name %s' % args[0]
            return
        if templateName in customStarterApps:
            tarballurl = 'http://github.com/mpstx/appengine_py_%s/tarball/master' % templateName
            tmpPath = join(join(alePath('tmp'), templateName + '.tar.gz'))
            download(tarballurl, '%s.tar.gz' % templateName)
            logging.info("Extracting %s here" % templateName)
            os.system('tar xzf %s --strip 1 -C .' % tmpPath)
        elif templateName == 'helloworld':
            logging.info('creating ./helloworld.py')
            FILE = open('./helloworld.py', 'w')
            FILE.write("""
print 'Content-Type: text/plain'
print ''
print 'Hello, world! This is a bare bones app engine application'
""")
            FILE.close()
            logging.info('creating ./app.yaml')
            FILE = open('./app.yaml', 'w')
            FILE.write("""
application: helloworld
version: 1
runtime: python
api_version: 1

handlers:
- url: /.*
  script: helloworld.py
""")
            FILE.close()
        elif templateName == 'helloworldwebapp':
            logging.info('creating ./helloworld.py')
            FILE = open('./helloworld.py', 'w')
            FILE.write("""
from google.appengine.ext import webapp
from google.appengine.ext.webapp.util import run_wsgi_app

class MainPage(webapp.RequestHandler):
    def get(self):
        self.response.headers['Content-Type'] = 'text/plain'
        self.response.out.write('Hello, webapp World!')

application = webapp.WSGIApplication([('/', MainPage)],
                                     debug=True)

def main():
    run_wsgi_app(application)

if __name__ == "__main__":
    main()
""")
            FILE.close()
            logging.info('creating ./app.yaml')
            FILE = open('./app.yaml', 'w')
            FILE.write("""
application: helloworldwebapp
version: 1
runtime: python
api_version: 1

handlers:
- url: /.*
  script: helloworld.py
""")
            FILE.close()
        else:
            pkgPath = join(join(alePath('recipes_installed'), 'createapp'), 'pkgs')
            templateZipPath = join(pkgPath, '%s.zip' % templateName)
            if os.path.exists(templateZipPath):
                extract(templateZipPath, '.')
                gitignore('tmp')
            else:
                logging.error('Could not find template: %s' % templateName)
                return
    return 0
def install(): fetch("http://www.fastcgi.com/dist/fcgi-%(fcgi)s.tar.gz") extract("fcgi-%(fcgi)s.tar.gz") configure("fcgi-%(fcgi)s", ["--prefix=%s" % env.prefix]) make("fcgi-%(fcgi)s") make("fcgi-%(fcgi)s", "install")
import sys
import time
import math
import utils
from pyspark.context import SparkContext

if len(sys.argv) > 1:
    hdfs_file_path = "/user/lsde02/data/%s/*.gz" % sys.argv[1]
else:
    hdfs_file_path = "/user/lsde02/data/1901/*.gz"
hdfs_results_path = "/user/lsde02/results/"
start_time = time.strftime("%Y-%m-%d-%H-%M-%S")

sc = SparkContext()
context = sc.textFile(hdfs_file_path)
stations = context.flatMap(lambda x: [utils.extract(record) for record in x.splitlines()])
stations = stations.filter(lambda x: 'longitude' in x[1] and 'latitude' in x[1])
stations.persist()

# Do computations on month level
month_data = stations.map(lambda x: ((x[0][0], x[0][1], x[0][3]),
                                     (x[1]['temp'], x[1]['wind-speed'], x[1]['sky-condition'],
                                      x[1]['visibility'], x[1]['wind-direction'])))
# Accumulator layout: (temp sum, count, wind-speed sum, count, sky-condition sum, count,
#                      visibility sum, count, wind-direction sin sum, wind-direction cos sum)
month_data = month_data.combineByKey(
    lambda value: (value[0], 1, value[1], 1, value[2], 1, value[3], 1,
                   math.sin(value[4] * math.pi / 180.), math.cos(value[4] * math.pi / 180.)),
    lambda x, value: (x[0] + value[0], x[1] + 1, x[2] + value[1], x[3] + 1,
                      x[4] + value[2], x[5] + 1, x[6] + value[3], x[7] + 1,
                      x[8] + math.sin(value[4] * math.pi / 180.),
                      x[9] + math.cos(value[4] * math.pi / 180.)),
    lambda x, y: (x[0] + y[0], x[1] + y[1], x[2] + y[2], x[3] + y[3], x[4] + y[4],
                  x[5] + y[5], x[6] + y[6], x[7] + y[7], x[8] + y[8], x[9] + y[9]))
month_data = month_data.map(lambda (label, (x1, c1, x2, c2, x3, c3, x4, c4, x5a, x5b)):
                            (label, (x1 / c1, x2 / c2, x3 / c3, x4 / c4, math.atan2(x5a, x5b))))
month_data = month_data.coalesce(1, True)
month_data.saveAsTextFile("%s%s-%s" % (hdfs_results_path, start_time, 'all'))
def install(self, args=None):
    dlFile = download('http://ipython.scipy.org/dist/0.10/ipython-0.10.tar.gz', 'ipython-0.10.tar.gz')
    extract(dlFile, extractPath)
def _be(self, *keys):
    """
    Private convenience wrapper around extract.
    Hash defaults to self._hash
    """
    return extract(self._hash, *keys)
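The dictionary-based `extract` helper that `_be()` wraps is not shown in these snippets; a minimal sketch of what it might look like, inferred from the call sites above (hypothetical, not the library's actual implementation):

def extract(hash, *keys):
    # Return the value of the first key present in the dictionary,
    # or None if none of the candidate keys are found.
    for key in keys:
        if key in hash:
            return hash[key]
    return None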
def __init__(self, bug, hash):
    """
    Initialize attachments

    :arg hash: Dictionary of attachment details
    :arg bug: Instance of :class:`~bz_xmlrpc.classes.Bug` object
    :return: Instance of :class:`Attachment`

    .. note:: No need to use this directly.
        Use :meth:`~bz_xmlrpc.classes.Bug.get_attachments()`
    """
    self._hash = hash
    self.id = extract(hash, "id", "attach_id")
    self.content_type = extract(hash, "content_type", "mimetype")
    self.creation_time = to_datetime(extract(hash, "creation_time", "creation_ts"))
    self.attacher = extract(hash, "attacher", "submitter_id")
    self.description = extract(hash, "description")
    self.file_name = extract(hash, "file_name", "filename")
    self.bug = bug
    self.is_private = bool(extract(hash, "is_private", "isprivate"))
    self.is_obsolete = bool(extract(hash, "is_obsolete", "isobsolete"))
    self.is_patch = bool(extract(hash, "is_patch", "ispatch"))
    self.is_url = bool(extract(hash, "is_url", "isurl"))
    self.last_change_time = to_datetime(extract(hash, "last_change_time", "modification_time"))
    if self.id and self.bug:
        self.fetch_url = bug.bz.url.replace("xmlrpc.cgi", "attachment.cgi?id=%s" % self.id)
def install(self, args=None):
    dlFile = download('http://github.com/ishikawa/modipyd/zipball/release-1-1-rc1', 'ishikawa-modipyd.zip')
    extract(dlFile, extractPath)
    os.system('chmod +x %s' % join(join(join(extractPath, 'ishikawa-modipyd-1516eeb'), 'bin'), 'modipyd'))
    os.system('chmod +x %s' % join(join(join(extractPath, 'ishikawa-modipyd-1516eeb'), 'bin'), 'pyautotest'))