def parse_legislator_row(chamber, session, row):
    cells = row("td")
    party = get_text(cells[-1])
    district = get_text(cells[-2])
    name_cell = cells[0].contents
    if not name_cell:
        return None
    linked_name = name_cell[0]
    first_name = middle_name = last_name = full_name = suffix = ""
    try:
        link = linked_name['href']
        match = MEMBER_ID_PATTERN.match(link)
        member_id = match.groups()[0]
        url = urljoin(MEMBER_LIST_URL[chamber], link)
        full_name = " ".join(linked_name.contents)  # a list
        if full_name.find(",") != -1:
            (name, suffix) = full_name.split(",")
        else:
            name = full_name
            suffix = ""
        name_parts = name.split()
        if len(name_parts) == 2:
            (first_name, last_name) = name_parts
        elif len(name_parts) > 3:
            (first_name, middle_name) = name_parts[:2]
            last_name = " ".join(name_parts[2:])
        elif len(name_parts) == 3:
            first_name, middle_name, last_name = name_parts
        else:
            raise ValueError("Unexpected number of parts to %s" % full_name)
    except KeyError:
        return None
def testEntry(self):
    etree = fromstring(ENTRY2)
    util.set_text(etree, 'content', 'html', '<p>hello</p>')
    self.assertEqual(("html", "<p>hello</p>"), util.get_text('content', etree))
    util.set_text(etree, 'title', 'xhtml', '<p>hello</p>')
    self.assertEqual(("xhtml", "<p>hello</p>"), util.get_text('title', etree))
    util.set_text(etree, 'summary', 'text', '<p>hello</p>')
    self.assertEqual(("text", "<p>hello</p>"), util.get_text('summary', etree))
def parse_item(self, response):
    source = 'wandoujia'
    name = util.get_text(response, '//p[@class="app-name"]/span/text()')
    if not name:
        return
    version = util.get_text(response, '//dl[@class="infos-list"]/dd[4]/text()')
    first = response.meta['first']
    second = util.get_text(response, '//div[@class="crumb"]/div[2]/a/span/text()')
    category = first + '-' + second
    time = util.get_text(response, '//time[@id="baidu_time"]/text()')
    size = util.get_text(response, '//dl[@class="infos-list"]/dd[1]/text()')
    system = util.get_text(response, '//dl[@class="infos-list"]/dd[5]/text()')
    text = util.get_text(response, '//div[@itemprop="description"]', 0)
    download = util.get_text(response, '//i[@itemprop="interactionCount"]/@content')
    pingfen = ''
    tag = response.xpath('//dd[@class="tag-box"]//a/text()').extract()
    tags = ','.join([i.strip() for i in tag])
    self.fileout.write(
        source + '\001' + name + '\001' + version + '\001' + category + '\001' +
        util.unify_data(time) + '\001' + size + '\001' + system + '\001' + text +
        '\001' + util.unify_download_count(download) + '\001' + pingfen + '\001' + tags)
    self.fileout.write('\n')
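# NOTE: the util.get_text(selector, xpath[, index]) helper these spiders call is
# not included in these excerpts. The sketch below is a hypothetical
# reconstruction inferred from how it is used here, assuming Scrapy's Selector
# API: with no index it joins the stripped text of every match; with an index
# it returns the n-th match with markup tags stripped. Treat the name and
# behavior as assumptions, not the actual util module.
import re

def get_text(selector, xpath, index=None):
    nodes = selector.xpath(xpath).extract()
    if index is None:
        # join all text()/@attr matches, e.g. '//span[@class="version"]/text()'
        return ''.join(s.strip() for s in nodes)
    if index >= len(nodes):
        return ''
    # element match, e.g. '//div[@itemprop="description"]': drop the tags
    return re.sub(r'<[^>]+>', '', nodes[index]).strip()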
def parse(self, response): applist = response.xpath("//dl[@class='down_list pd20']") for app in applist: app_url = util.get_text(app, "./dd[@class='down_title']/h2/a/@href") time = util.get_text(app, "./dd[@class='down_attribute align_l']/span[3]/text()") system = util.get_text(app, "./dd[@class='down_attribute align_l']/span[5]/text()") download = util.get_text(app, "./dd[@class='down_attribute align_l']/span[7]/text()") yield scrapy.Request( app_url, callback=self.parse_item, meta={"time": time, "system": system, "download": download} ) next = response.xpath('//a[text()="下一页"]/@href').extract() if next: yield scrapy.Request(next[0], callback=self.parse)
def parse_item(self, response):
    source = 'anzhi'
    name = util.get_text(response, '//div[@class="detail_line"]/h3//text()')
    if not name:
        return
    version = util.get_text(response, '//div[@class="detail_line"]/span//text()')[1:-1]
    first = response.meta['cate']
    data = response.xpath('//ul[@id="detail_line_ul"]/li//text()').extract()
    # defaults so an unexpected list length cannot leave these names unbound
    second = download = time = size = system = ''
    if len(data) == 7:
        second = data[0][3:]
        download = data[1][3:]
        time = data[2][3:]
        size = data[3][3:]
        system = data[4][3:]
    if len(data) == 6:
        second = data[0][3:]
        download = ''
        time = data[1][3:]
        size = data[2][3:]
        system = data[3][3:]
    category = first + '-' + second
    text = util.get_text(response, '//div[@class="app_detail_infor"]', 0)
    pingfen = util.get_text(response, '//div[@id="stars_detail"]/@style')
    p = pingfen.split('-')
    if len(p) == 2:
        pingfen = '0.0'
    elif len(p) == 3:
        pingfen = p[2][:-3]
    try:
        # rescale the star-bar pixel width extracted from the style attribute
        pingfen = str(float(pingfen) / 15 * 10)
    except Exception:
        pingfen = ''
    tags = ''
    self.fileout.write(
        source + '\001' + name + '\001' + version + '\001' + category + '\001' +
        util.unify_data(time) + '\001' + size + '\001' + system + '\001' + text +
        '\001' + util.unify_download_count(download) + '\001' + pingfen + '\001' + tags)
    self.fileout.write('\n')
def parse_item(self, response):
    source = 'hiapk'
    name_and_version = util.get_text(response, "//div[@id='appSoftName']/text()")
    try:
        version = name_and_version.split('(')[1].split(')')[0]
        name = name_and_version.split('(')[0]
    except Exception:
        version = ''
        name = name_and_version
    if not name:
        return
    first = util.get_text(response, "//a[@id='categoryParent']/text()")
    second = util.get_text(response, "//a[@id='categoryLink']/text()")
    category = first + '-' + second
    time = util.get_text(
        response,
        '//div[@class="code_box_border"]/div[@class="line_content"][7]/span[2]/text()')
    size = util.get_text(response, '//span[@id="appSize"]/text()')
    system = util.get_text(
        response, '//span[@class="font14 detailMiniSdk d_gj_line left"]/text()')
    text = util.get_text(response, '//pre[@id="softIntroduce"]', 0)
    download = util.get_text(
        response,
        '//div[@class="code_box_border"]/div[@class="line_content"][2]/span[2]/text()')
    pingfen = util.get_text(response, '//div[@id="appIconTips"]/div[1]/@class')
    try:
        pingfen = str(float(pingfen.split(" ")[2].split("_")[2]) * 2)
    except Exception:
        pingfen = ''
    tags = ''
    self.fileout.write(
        source + '\001' + name + '\001' + version + '\001' + category + '\001' +
        util.unify_data(time) + '\001' + size + '\001' + system + '\001' + text +
        '\001' + util.unify_download_count(download) + '\001' + pingfen + '\001' + tags)
    self.fileout.write('\n')
def parse_item(self, response):
    source = 'applestore'
    name = util.get_text(
        response, '//div[@id="desktopContentBlockId"]//div[@id="title"]//h1/text()')
    if not name:
        return
    version = util.get_text(
        response, '//div[@id="left-stack"]//span[@itemprop="softwareVersion"]/text()')
    first = '软件'  # "software"
    second = util.get_text(
        response, '//div[@id="left-stack"]//span[@itemprop="applicationCategory"]/text()')
    category = first + '-' + second
    time = util.get_text(
        response, '//div[@id="left-stack"]//span[@itemprop="datePublished"]/text()')
    size = ''
    system = util.get_text(
        response, '//div[@id="left-stack"]//span[@itemprop="operatingSystem"]/text()')
    text = util.get_text(
        response, '//div[@class="center-stack"]/div[@class="product-review"]/p', 0)
    download = util.get_text(
        response, '//div[@class="extra-list customer-ratings"]/div[4]/span/text()')
    pingfen = util.get_text(
        response, '//div[@class="extra-list customer-ratings"]/div[4]/@aria-label')
    try:
        # the aria-label reads like "4.5星, ..." ("4.5 stars, ...")
        pingfen = str(float(pingfen.split('星, ')[0]) * 20)
    except Exception:
        pingfen = ''
    tags = ''
    self.fileout.write(
        source + '\001' + name + '\001' + version + '\001' + category + '\001' +
        util.unify_data(time) + '\001' + size + '\001' + system + '\001' + text +
        '\001' + util.unify_download_count(download) + '\001' + pingfen + '\001' + tags)
    self.fileout.write('\n')
def parse(self, response):
    applist = response.xpath('//p[@class="f-s3 t-overflow"]/a/@href').extract()
    for app in applist:
        yield scrapy.Request('http://m.163.com' + app, callback=self.parse_item)
    # "下一页" is the "next page" link; the last page uses the '#next' placeholder
    next = util.get_text(response, '//a[text()="下一页"]/@href')
    if next != '#next':
        yield scrapy.Request('http://m.163.com' + next, callback=self.parse)
def parse_page(self, response):
    app_list = response.xpath('//div[@class="topic_before"]/a/@href').extract()
    cat = util.get_text(response, '//div[@class="l_box_title"]/h3/text()')
    for i in app_list:
        yield scrapy.Request('http://apk.91.com' + i,
                             callback=self.parse_item, meta={'cat': cat})
def extract_actions(s):
    actions = []
    anchor = s("a", {'name': 'actions'})[0]
    table = None
    for x in anchor.nextGenerator():
        if hasattr(x, 'name') and getattr(x, 'name') == 'table':
            table = x
            break
    if table:
        cells = table("td", {"class": "content"})
        # markup bad: only header row correctly wrapped in a "tr"!
        while cells:
            (date, chamber, action) = cells[0:3]
            date = get_text(date).replace(u"\xa0", " ").strip()  # drop non-breaking spaces
            chamber = standardize_chamber(get_text(chamber).lower())
            cells = cells[3:]
            action = get_text(action)
            actions.append((chamber, action, date))
    return actions
def parse(self, response):
    cate = util.get_text(response, '//li[@class="current"]/a/text()')[2:]
    app_list = response.xpath(
        '//div[@class="app_list border_three"]//div[@class="app_info"]//a/@href').extract()
    for i in app_list:
        yield scrapy.Request('http://www.anzhi.com' + i,
                             callback=self.parse_item, meta={'cate': cate})
    next = response.xpath('//div[@class="pagebars"]//a[@class="next"]/@href').extract()
    if next:
        yield scrapy.Request('http://www.anzhi.com' + next[0], callback=self.parse)
def parse_item(self, response):
    source = 'yingyonghui'
    name = util.get_text(response, '//h1[@class="app-name"]/text()')
    if not name:
        return
    version = util.get_text(response, '//div[@class="intro"]/p[1]/text()[2]')[3:]
    first = util.get_text(response, '//div[@class="breadcrumb centre-content"]/a[2]/text()')
    second = util.get_text(response, '//div[@class="breadcrumb centre-content"]/a[3]/text()')
    category = first + '-' + second
    time = util.get_text(response, '//div[@class="intro"]/p[1]/text()')[3:]
    size = util.get_text(response, '//span[@class="app-statistic"]/text()[2]')
    try:
        # the statistic reads like "大小:12M 更新..." ("size:12M updated...")
        size = size.split('大小:')[1].split(' 更新')[0]
    except Exception:
        size = ''
    system = util.get_text(response, '//p[@class="art-content"][3]/text()[4]')[3:]
    text = util.get_text(response, '//div[@class="main-info"]/p[1]', 0)
    download = util.get_text(response, '//span[@class="app-statistic"]/text()')
    try:
        download = download.split('下载')[0]  # '下载' = "downloads"
    except Exception:
        download = ''
    pingfen = ''
    tags = ''
    self.fileout.write(
        source + '\001' + name + '\001' + version + '\001' + category + '\001' +
        util.unify_data(time) + '\001' + size + '\001' + system + '\001' + text +
        '\001' + util.unify_download_count(download) + '\001' + pingfen + '\001' + tags)
    self.fileout.write('\n')
def parse_item(self, response):
    source = 'apk91'
    name = util.get_text(response, '//h1[@class="ff f20 fb fl"]/text()')
    if not name:
        return
    version = util.get_text(response, '//ul[@class="s_info"]/li[1]/text()')[3:]
    first = util.get_text(response, '//div[@class="crumb clearfix"]/a[2]/text()')
    second = response.meta['cat']
    category = first + '-' + second
    time = util.get_text(response, '//ul[@class="s_info"]/li[5]/text()')[5:15]
    size = util.get_text(response, '//ul[@class="s_info"]/li[3]/text()')[5:]
    system = util.get_text(response, '//ul[@class="s_info"]/li[4]/text()')[5:]
    text = util.get_text(response, '//div[@class="o-content"]', 0)
    download = util.get_text(response, '//ul[@class="s_info"]/li[2]/text()')
    pingfen = util.get_text(
        response, '//div[@class="s_intro_pic fl"]/span[@class="spr star"]/a/@class')
    try:
        pingfen = str(float(pingfen.split('w')[1].split(' ')[0]) * 20)
    except Exception:
        pingfen = ''
    tag = response.xpath('//ul[@class="s_info"]/li[10]/a/text()').extract()
    tags = ','.join(tag)
    self.fileout.write(
        source + '\001' + name + '\001' + version + '\001' + category + '\001' +
        util.unify_data(time) + '\001' + size + '\001' + system + '\001' + text +
        '\001' + util.unify_download_count(download) + '\001' + pingfen + '\001' + tags)
    self.fileout.write('\n')
def main(): parser = argparse.ArgumentParser(description="Remove duplicate exception aliases.") parser.add_argument('file', metavar='FILE', type=str, help="File to remove duplicates from.") parser.add_argument('--output', dest='output', action='store', default='FILE', help="File to write result to. (default: %(default)s)") parser.add_argument('--write', dest='write', action='store_true', help="Disable prompt, write to OUTPUT file automatically") args = parser.parse_args() if args.output == 'FILE': args.output = args.file print "file", '"' + args.file + '"' print "output", '"' + args.output + '"' print "write", args.write print if not os.path.exists(args.file): print "ERROR: file", '"' + args.file + '"', "does not exist" return data = remove_duplicates(args.file) output_original = get_text(args.output) data_len = len(data.replace('\n', '').replace('\r', '')) output_original_len = len(output_original.replace('\n', '').replace('\r', '')) print abs(data_len - output_original_len), "bytes changed" if data_len - output_original_len == 0: print "nothing to save" return print 'validating data:', if not validate(data, trace=True): return if not args.write: args.write = raw_input('Write to "' + args.output + '"? [no]: ') == 'yes' if args.write: f = open(args.output, 'wb') f.write(data) f.close() print "done" else: print 'not writing data'
def parse(self, response): applist = response.xpath("//dl[@class='down_list pd20']") for app in applist: app_url = util.get_text(app, "./dd[@class='down_title']/h2/a/@href") time = util.get_text( app, "./dd[@class='down_attribute align_l']/span[3]/text()") system = util.get_text( app, "./dd[@class='down_attribute align_l']/span[5]/text()") download = util.get_text( app, "./dd[@class='down_attribute align_l']/span[7]/text()") yield scrapy.Request(app_url, callback=self.parse_item, meta={ "time": time, "system": system, "download": download }) next = response.xpath('//a[text()="下一页"]/@href').extract() if next: yield scrapy.Request(next[0], callback=self.parse)
def parse_item(self, response):
    source = '3310'
    name_version = util.get_text(response, '//div[@class="cont"]/h2/text()')
    if not name_version:
        return
    ns = name_version.split(' ')
    version = ns.pop(-1)
    name = ' '.join(ns)
    first = util.get_text(response, '//div[@class="guide"]/a[3]/text()')
    second = util.get_text(response, '//div[@class="guide"]/a[4]/text()')
    category = first + '-' + second
    time = util.get_text(response, '//div[@class="cont"]/p[2]/text()')[5:]
    size = util.get_text(response, '//div[@class="cont"]/p[1]/span/text()')[3:]
    system = util.get_text(response, '//div[@class="cont"]/p[3]/span/text()')[5:]
    text = util.get_text(response, '//div[@class="pictxt item"][not(@style)]', 0)
    download = util.get_text(response, '//span[@id="downnum"]/text()')
    pingfen = util.get_text(response, '//div[@class="score"]/span/text()')
    try:
        pingfen = str(float(pingfen) * 20)
    except Exception:
        pingfen = ''
    tags = ''
    self.fileout.write(
        source + '\001' + name + '\001' + version + '\001' + category + '\001' +
        util.unify_data(time) + '\001' + size + '\001' + system + '\001' + text +
        '\001' + util.unify_download_count(download) + '\001' + pingfen + '\001' + tags)
    self.fileout.write('\n')
def parse_item(self, response): source = "anzow" name = util.get_text(response, "//dl[@class='down_info clear']/dd/div[1]/h1/text()") if not name: return version = "" first = util.get_text(response, "//div[@class='crumbs fl']/a[2]/text()")[-2:] second = util.get_text(response, "//div[@class='crumbs fl']/a[3]/text()") category = first + "-" + second time = response.meta["time"] size = util.get_text(response, '//div[@class="xiazai1"][1]/../dl/dt/ul/li[3]/text()') system = response.meta["system"] text = util.get_text(response, '//div[@class="down_intro"]', 0) download = response.meta["download"] pingfen = util.get_text(response, '//dl[@class="down_info clear"]/dd/dl/dt/ul/li[7]/strong/text()') try: pingfen = str(pingfen.count("★") * 20) except Exception: pingfen = "" tag = response.xpath('//p[@class="keywords"]//a/text()').extract() tags = ",".join(tag) self.fileout.write( source + "\001" + name + "\001" + version + "\001" + category + "\001" + util.unify_data(time) + "\001" + size + "\001" + system + "\001" + text + "\001" + util.unify_download_count(download) + "\001" + pingfen + "\001" + tags ) self.fileout.write("\n")
def parse_item(self, response):
    source = '163'
    name = util.get_text(response, '//span[@class="f-h1"]/text()')
    if not name:
        return
    version = util.get_text(response, '//table[@class="table-appinfo"]/tr[3]/td/text()')
    first = util.get_text(
        response, "//div[@class='sect']/div[@class='crumb']/a[2]/text()")[-2:]
    second = util.get_text(
        response, "//div[@class='sect']/div[@class='crumb']/a[3]/text()")
    category = first + '-' + second
    time = ''
    size = util.get_text(response, '//table[@class="table-appinfo"]/tr[2]/td[1]/text()')
    system = ''
    text = util.get_text(response, '//div[@id="app-desc"]', 0)
    download = util.get_text(response, '//span[@class="vote-text-s"]/text()')[1:-1]
    pingfen = util.get_text(response, '//span[@class="vote-column-s"]/i/@style')
    try:
        pingfen = pingfen.split(':')[1].split('%')[0]
    except Exception:
        pingfen = ''
    tags = ''
    self.fileout.write(
        source + '\001' + name + '\001' + version + '\001' + category + '\001' +
        util.unify_data(time) + '\001' + size + '\001' + system + '\001' + text +
        '\001' + util.unify_download_count(download) + '\001' + pingfen + '\001' + tags)
    self.fileout.write('\n')
def parse_item(self, response):
    source = '25pp'
    name = util.get_text(response, '//div[@class="title-stat"]/div[@class="txt"]/h1/text()')
    if not name:
        return
    version = util.get_text(
        response, '//div[@class="title-stat"]/div[@class="txt"]/ul/li[1]/text()')[3:]
    first = util.get_text(response, '//div[@class="location"]/a[2]/text()')
    second = util.get_text(
        response, '//div[@class="title-stat"]/div[@class="txt"]/ul/li[2]/text()')
    category = first + '-' + second
    time = ''
    size = util.get_text(
        response, '//div[@class="title-stat"]/div[@class="txt"]/ul/li[3]/text()')[3:]
    system = util.get_text(
        response, '//div[@class="title-stat"]/div[@class="txt"]/ul/li[5]/text()')[5:]
    text = util.get_text(response, '//div[@class="conTxt"][1]', 0)
    download = util.get_text(response, '//li[@class="borderR"]/span/text()')
    pingfen = util.get_text(response, '//div[@class="downMunber"]/ul/li[3]/span/text()')
    try:
        pingfen = str(float(pingfen) * 20)
    except Exception:
        pingfen = ''
    tag = response.xpath('//li[@class="w-450"]//a/text()').extract()
    tags = ','.join(tag)
    self.fileout.write(
        source + '\001' + name + '\001' + version + '\001' + category + '\001' +
        util.unify_data(time) + '\001' + size + '\001' + system + '\001' + text +
        '\001' + util.unify_download_count(download) + '\001' + pingfen + '\001' + tags)
    self.fileout.write('\n')
def parse_item(self, response):
    source = '360'
    name = util.get_text(response, '//h2[@id="app-name"]/span/text()')
    if not name:
        return
    version = util.get_text(
        response,
        '//div[@class="breif"]/div[@class="base-info"]/table/tbody/tr[2]/td[1]/text()')
    first = util.get_text(response, '//div[@class="nav"]/ul/li[@class="cur"]/a/text()')[1:]
    second = response.meta['categroy']
    category = first + '-' + second
    time = util.get_text(
        response,
        '//div[@class="breif"]/div[@class="base-info"]/table/tbody/tr[1]/td[2]/text()')
    size = util.get_text(response, '//div[@class="pf"]/span[@class="s-3"][2]/text()')
    system = util.get_text(
        response,
        '//div[@class="breif"]/div[@class="base-info"]/table/tbody/tr[2]/td[2]/text()')
    text = util.get_text(response, '//div[@class="breif"]', 0)
    download = util.get_text(response, '//div[@class="pf"]/span[@class="s-3"][1]/text()')
    pingfen = util.get_text(response, '//div[@class="pf"]/span[@class="s-1 js-votepanel"]/text()')
    try:
        pingfen = str(float(pingfen) * 10)
    except Exception:
        pingfen = ''
    tag = response.xpath('//div[@class="app-tags"]//a/text()').extract()
    tags = ','.join([i.strip() for i in tag])
    self.fileout.write(
        source + '\001' + name + '\001' + version + '\001' + category + '\001' +
        util.unify_data(time) + '\001' + size + '\001' + system + '\001' + text +
        '\001' + util.unify_download_count(download) + '\001' + pingfen + '\001' + tags)
    self.fileout.write('\n')
def parse(self, response):
    source = 'baidu'
    name = util.get_text(response, '//span[@class="gray"]/text()')
    if not name:
        return
    version = util.get_text(response, '//span[@class="version"]/text()')[3:]
    first = util.get_text(response, '//div[@class="nav"]//a/text()')
    second = util.get_text(response, '//div[@class="nav"]/span[3]/a/text()')
    category = first + '-' + second
    time = ''
    size = util.get_text(response, '//span[@class="size"]/text()')[3:]
    system = ''
    text = util.get_text(response, '//div[@class="brief-long"]/p', 0)
    download = util.get_text(response, '//span[@class="download-num"]/text()')[5:]
    pingfen = util.get_text(response, '//span[@class="star-percent"]/@style')
    try:
        pingfen = pingfen.split(':')[1].split('%')[0]
    except Exception:
        pingfen = ''
    tags = ''
    self.fileout.write(
        source + '\001' + name + '\001' + version + '\001' + category + '\001' +
        util.unify_data(time) + '\001' + size + '\001' + system + '\001' + text +
        '\001' + util.unify_download_count(download) + '\001' + pingfen + '\001' + tags)
    self.fileout.write('\n')
def merge(localPath, remotePath):
    items = {}
    key_order = []
    changes = {}
    for path in [localPath, remotePath]:
        for (li, tvdb_id, sep, alias_list, line) in parse_data(get_text(path)):
            if not validate_tvdb_id(tvdb_id):
                continue
            if tvdb_id not in items:
                items[tvdb_id] = []
                key_order.append(tvdb_id)
            for alias in alias_list:
                alias = alias.strip().replace("'", "\\'")
                if not find_match(alias, items[tvdb_id]):
                    items[tvdb_id].append(alias)
                    # track remote changes
                    if path == remotePath:
                        if tvdb_id not in changes:
                            changes[tvdb_id] = []
                        changes[tvdb_id].append(alias)
    print "----------------------------------------------------------"
    print "New Shows"
    print "----------------------------------------------------------"
    for ck, added in changes.items():
        if items[ck] == added:
            print str(ck) + '\tnew\t\t' + str(added)
    print "----------------------------------------------------------"
    print "New Aliases"
    print "----------------------------------------------------------"
    for ck, added in changes.items():
        if items[ck] != added:
            print str(ck) + '\tadd\t\t' + str(added)
            print '=============\t', items[ck]
            print
    return dict_to_data(items, key_order)
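# A minimal usage sketch for merge(); the file names below are placeholders.
# merge() prints the remote-change report and returns the combined alias data
# as one string, which can be written back out the same way main() above does.
merged = merge("exceptions_local.txt", "exceptions_remote.txt")
f = open("exceptions_local.txt", "wb")
f.write(merged)
f.close()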
def main(): parser = argparse.ArgumentParser(description="Validate file is in the correct form.") parser.add_argument('file', metavar='FILE', type=str, help="LOCAL file to validate.") args = parser.parse_args() if not os.path.exists(args.file): print "ERROR: file", '"' + args.file + '"', "does not exist" return print "file", '"' + args.file + '"' print valid = validate(get_text(args.file), trace=True) if not valid: sys.exit(1)
def train(input_paths, encoding, output_path):
    """
    :param input_paths: a list of one string representing the input path to a
        metadata file. Each line of the file contains a class name and the
        path to an input file, separated by whitespace.
    """
    input_metadata = input_paths[0]
    doc_tokens = []
    doc_labels = []
    class_freqs = collections.defaultdict(int)
    word_freqs = collections.defaultdict(int)
    class_sizes = collections.defaultdict(int)
    vocabulary = set()
    with open(input_metadata, "r") as metadata_file:
        for line in metadata_file:
            doc_label, doc_path = line.strip().split()
            with open(doc_path, "r") as doc_file:
                tokens = util.tokenize(util.get_text(doc_file, encoding))
            doc_labels.append(doc_label)
            doc_tokens.append(tokens)
            class_freqs[doc_label] += 1
            class_sizes[doc_label] += len(tokens)  # total token count per class
            for word in tokens:
                vocabulary.add(word)
                word_freqs[(doc_label, word)] += 1
    class_freqs = {
        c: f / float(len(doc_tokens)) for c, f in class_freqs.iteritems()
    }
    model = NaiveBayesModel(class_freqs, word_freqs, class_sizes, vocabulary)
    with open(output_path, "w") as output_file:
        pickle.dump(model, output_file)
def classify(self, input_path, encoding):
    """
    :return: a tuple of (class label, weights) where ``weights`` is the list
        of (class label, weight) tuples
    """
    with open(input_path, "r") as input_file:
        doc_tokens = util.tokenize(util.get_text(input_file, encoding))
    weights = self.doc_in_class_probabilities(doc_tokens, self.class_freqs.keys())
    best_weight = None
    for label, weight in weights:
        if best_weight is None or weight > best_weight:
            best_weight = weight
            best_label = label
    return (best_label, weights)
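# A minimal end-to-end sketch for the Naive Bayes trainer and classifier
# above. The paths and encoding are placeholders; it assumes the pickled
# object is the NaiveBayesModel whose classify() method is shown.
import pickle

train(["metadata.txt"], "utf-8", "nb.model")
with open("nb.model", "r") as model_file:
    model = pickle.load(model_file)
label, weights = model.classify("unseen_doc.txt", "utf-8")
print "predicted class:", label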
def remove_duplicates(path):
    items = {}
    key_order = []
    for (li, tvdb_id, sep, alias_list, line) in parse_data(get_text(path)):
        if not validate_tvdb_id(tvdb_id):
            continue
        if tvdb_id not in items:
            items[tvdb_id] = []
            key_order.append(tvdb_id)
        for alias in alias_list:
            alias = alias.strip().replace("'", "\\'")
            if not find_match(alias, items[tvdb_id]):
                items[tvdb_id].append(alias)
    return dict_to_data(items, key_order)
def parse_item(self, response):
    source = 'mumayi'
    name_version = util.get_text(response, '//h1[@class="iappname hidden fl"]/text()')
    if not name_version:
        return
    sn = name_version.split('V')
    version = sn.pop(-1) if len(sn) > 1 else ''
    name = 'V'.join(sn) if sn else name_version
    first = util.get_text(response, '//div[@id="classlists"]/a[2]/text()')[:2]
    second = util.get_text(response, '//div[@id="classlists"]/a[3]/text()')
    category = first + '-' + second
    time = response.meta['time']
    size = util.get_text(response, '//span[text()="程序大小:"]/../text()')
    system = util.get_text(response, '//div[@class="sel_text fl"]/text()')
    text = util.get_text(response, '//ul[@class="author"]/..//p[position()<last()]', 0)
    download = ''
    pingfen = util.get_text(response, '//div[@id="starlist"]/@class')
    try:
        pingfen = str(float(pingfen.split('now')[1]) * 2)
    except Exception:
        pingfen = ''
    tags = ''
    self.fileout.write(
        source + '\001' + name + '\001' + version + '\001' + category + '\001' +
        util.unify_data(time) + '\001' + size + '\001' + system + '\001' + text +
        '\001' + util.unify_download_count(download) + '\001' + pingfen + '\001' + tags)
    self.fileout.write('\n')
def parse_item(self, response):
    source = 'yingyongbao'
    name = response.meta['name']
    if not name:
        return
    version = response.meta['version']
    # first = response.meta['first']
    second = response.meta['category']
    category = response.meta['category']
    t = response.meta['time']
    import time
    try:
        st = time.strftime('%Y-%m-%d', time.localtime(t))
    except Exception:
        st = ''
    size = response.meta['size']
    size = str(size / 1000000) + 'M'
    system = ''
    text = util.get_text(response, '//div[@class="det-intro-text"]', 0)
    download = str(response.meta['appdown'])
    pingfen = str(response.meta['pingfen'])
    tags = ''
    self.fileout.write(
        source + '\001' + name + '\001' + version + '\001' + category + '\001' +
        st + '\001' + size + '\001' + system + '\001' + text + '\001' +
        util.unify_download_count(download) + '\001' + pingfen + '\001' + tags)
    self.fileout.write('\n')
def parse_bill(scraper, url):
    """Given a bill status URL, return a fully loaded Bill object, except for
    votes, which are expected to be handled externally.
    """
    session = extract_session(url)
    chamber = chamber_for_doctype(extract_doctype(url))
    s = get_soup(scraper, url)
    bill_id = extract_bill_id(s)
    landmark = s(text=re.compile(".*Short Description.*"))
    name_span = landmark[0].findParent().findNextSibling()
    bill_name = get_text(name_span)
    bill = Bill(session, chamber, bill_id, bill_name.strip(), status_url=url)
    actions = extract_actions(s)
    for chamber, action, date in actions:
        bill.add_action(chamber, action, date)  # kwargs are permitted if we have 'em.
    sponsor_dict = extract_sponsors_from_actions([action[1] for action in actions])
    for type, namelist in sponsor_dict.iteritems():
        for name in namelist:
            bill.add_sponsor(type, name)
    for name, link in extract_versions(scraper, s):
        bill.add_version(name, link)
    return bill
def parse_item(self, response):
    source = 'xiaomi'
    name = util.get_text(response, '//div[@class="intro-titles"]/h3/text()')
    if not name:
        return
    version = util.get_text(response, '//ul[@class=" cf"]/li[4]/text()')
    first = response.meta['first']
    second = util.get_text(response, '//div[@class="bread-crumb"]/ul/li[2]/a/text()')
    category = first + '-' + second
    time = util.get_text(response, '//ul[@class=" cf"]/li[6]/text()')
    size = util.get_text(response, '//ul[@class=" cf"]/li[2]/text()')
    system = ''
    text = util.get_text(response, '//p[@class="pslide"]', 0)
    download = ''
    pingfen = util.get_text(response, '//div[@class="star1-empty"]/div/@class')
    try:
        pingfen = str(float(pingfen.split('star1-hover star1-')[1]) * 10)
    except Exception:
        pingfen = ''
    tags = ''
    self.fileout.write(
        source + '\001' + name + '\001' + version + '\001' + category + '\001' +
        util.unify_data(time) + '\001' + size + '\001' + system + '\001' + text +
        '\001' + util.unify_download_count(download) + '\001' + pingfen + '\001' + tags)
    self.fileout.write('\n')
def parse(self, response):
    first = util.get_text(response, '//span[@class="last"]/text()')[2:4]
    cats = response.xpath('//ul[@class="clearfix tag-box"]//li/a/span/text()').extract()
    for cat in cats:
        yield scrapy.Request(
            'http://apps.wandoujia.com/api/v1/apps?tag=' + cat +
            '&max=60&start=0&opt_fields=apps.packageName',
            callback=self.parse_page, meta={'cat': cat, 'first': first})
import sys

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from util import get_text, create_dicts, on_gpu, get_batches, one_hot_encode, write_file
from model import RNN

device = on_gpu()

# Declaring the hyperparameters
batch_size = 128
seq_length = 100
n_epochs = 100  # start smaller if you are just testing initial behavior
lr = 1e-3

files = sys.argv[1:] if len(sys.argv) > 1 else ['shakespeare']
filename = '-'.join(files)
text = get_text(files)
chars, int2char, char2int = create_dicts(text)

# Encode the text
data = np.array([char2int[ch] for ch in text])

net = RNN(chars).to(device)
opt = torch.optim.Adam(net.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

# Declaring the train method
def train(epochs=20, clip=5, val_frac=0.1, print_every=100):
    global data
    net.train()
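# The one_hot_encode helper imported above is not shown in this excerpt. The
# sketch below is only an assumed reconstruction, following the standard
# char-RNN formulation: each integer id in a batch becomes a one-hot vector of
# length n_labels (the vocabulary size).
def one_hot_encode(arr, n_labels):
    one_hot = np.zeros((arr.size, n_labels), dtype=np.float32)
    one_hot[np.arange(arr.size), arr.flatten()] = 1.0
    # restore the original batch shape, with the one-hot axis appended
    return one_hot.reshape(*arr.shape, n_labels)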
""" https://nlp100.github.io/ja/ch01.html#00-%E6%96%87%E5%AD%97%E5%88%97%E3%81%AE%E9%80%86%E9%A0%86 """ from util import get_text if __name__ == '__main__': print(get_text())
""" https://nlp100.github.io/ja/ch03.html#23-%E3%82%BB%E3%82%AF%E3%82%B7%E3%83%A7%E3%83%B3%E6%A7%8B%E9%80%A0 """ import re from util import get_text if __name__ == '__main__': sep = "=" pat = re.compile(r'(==+)(.*)==+') text = get_text() for match in re.finditer(pat, text): print(match.group(0)) section_sep = match.group(1) print(len(section_sep))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import nltk
import util

# Doesn't work very well in French...
LANGUAGE = "english"

text_file = util.get_text(LANGUAGE)
text_str = text_file.read()
text_str = nltk.word_tokenize(text_str, language=LANGUAGE)
text_str = util.clean_text(text_str, LANGUAGE)
text_tag = nltk.pos_tag(text_str)
nltk_text = nltk.Text(text_str)
print type(nltk_text)

# CHUNKING
sentence = nltk.word_tokenize("Bouteflika is the president of Algeria.")
sentence = nltk.pos_tag(sentence)
# grammar = "Actor: {<DT>?<JJS>*<NNP>+}"  # JJ = adjective
# chunk = nltk.RegexpParser(grammar)
# result = chunk.parse(text_tag)
def testSimple(self):
    etree = fromstring(ENTRY1)
    self.assertEqual(("text", "third"), util.get_text('title', etree))
    self.assertEqual(("html", "<p>not much</p>"), util.get_text('summary', etree))
    self.assertEqual(
        ("xhtml",
         u'<p style="color:red" other=\'&amp; and &lt; and "\'>Some stuff</p>'
         u'<i>&lt;</i>.\n '),
        util.get_text('content', etree))
    def link(self, link, title, text):
        return f"[[{text}:{link}]]"

puki = PukiwikiRenderer()
md = mistune.Markdown(renderer=puki)

# note: the name 'puki' is rebound below; md already holds the renderer instance
def puki(filename, comments):
    with open(filename) as f:
        name = filename.split("/")[-1]
        comment = "\n".join(comments.get(name, ""))
        return md(f.read()) + f"\n//{name} cmt_begin\n{comment}\n#comment\n//{name} cmt_end\n"

print("getting text")
text = util.get_text()
print("merging")
merged = "\n".join([
    puki(filename, text.comments)
    for filename in sorted(glob("./entries/*"), reverse=True)
])
text.write(merged)
print("writing text")
response = text.set_text()
print(response)