def process_stream(self):
    """
    Processes the followers' 10-minute tweet streams
    :return:
    """
    with open('tmp_followers_stream.txt', 'r') as fp:
        f = json.load(fp)
    e = extract.Extract(f)
    self.followers_stream = e.process_stream()
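# A hypothetical driver for process_stream. The StreamProcessor class name and
# the JSON layout of tmp_followers_stream.txt (a list of raw tweet dicts
# collected over a 10-minute window) are assumptions; only the file name and
# the extract.Extract(...) / process_stream() calls come from the method above.
import json

if __name__ == "__main__":
    sample = [
        {"user": "follower_a", "text": "first tweet"},
        {"user": "follower_b", "text": "second tweet"},
    ]
    with open('tmp_followers_stream.txt', 'w') as fp:
        json.dump(sample, fp)

    proc = StreamProcessor()  # hypothetical class owning process_stream
    proc.process_stream()
    print(proc.followers_stream)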
def __init__(self):
    self.ex = extract.Extract()
    self.sc = scrape.Scrape()
    self.fm = format.Format()
    self.name = 1  # The cases get a number for a name, starting with 1
    # Accepted two-digit suffixes, presumably the years 1996-2009 of the
    # judgment archives this scraper handles
    self.allowed = [
        "09", "08", "07", "06", "05", "04", "03",
        "02", "01", "00", "99", "98", "97", "96"
    ]
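# A hypothetical illustration of how the `allowed` list could gate case URLs
# to judgments from 1996-2009. The is_allowed helper and the jdYYMMDD date
# convention are assumptions inferred from the publications.parliament.uk
# links used elsewhere in this repo, not part of the original class.
import re

def is_allowed(url, allowed):
    # Judgment URLs embed the date as jdYYMMDD, e.g. .../jd961121/smith01.htm
    m = re.search(r'/jd(\d{2})\d{4}/', url)
    return m is not None and m.group(1) in allowed

# is_allowed('.../jd961121/smith01.htm', ['96', '97']) -> True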
def parse_and_save_html(self):
    # signal.SIGINT corresponds to Ctrl+C; when this signal is received,
    # the stop function is called
    signal.signal(signal.SIGINT, self.stop)
    while True:
        if self.redis.lindex('html'):
            content = self.redis.brpop('html')
            text = content['html'].decode('utf-8')
            extracts = extract.Extract(item=items.Item(), text=text,
                                       selector=parsel.Selector)
            # If the URL matches the regular expression, the corresponding
            # response carries structured data that needs to be extracted
            if self.patten.search(content['url'].decode('utf-8')):
                extracts.item_xpath('movie_name',
                                    '//h1/span[@property="v:itemreviewed"]/text()')
                extracts.item_xpath('movie_year',
                                    '//span[@property="v:initialReleaseDate"]/text()')
                extracts.item_xpath('movie_type',
                                    '//span[@property="v:genre"]/text()')
                extracts.item_xpath('movie_rate',
                                    '//strong[@class="ll rating_num"]/text()')
                item = extracts.get_item()
                result_item = (content['url'], item['movie_name'],
                               item['movie_year'], item['movie_type'],
                               item['movie_rate'],)
                cmd = """insert into item (url, movie_name, movie_year,
                         movie_type, movie_rate)
                         values (%s, %s, %s, %s, %s)"""
                self.db.query(cmd, result_item)
            else:
                extracts.link_xpath('//a/@href', r'/subject/[0-9]+/$|/tag/.*')
                url_list = extracts.get_links()
                # The source URL is passed along here because links extracted
                # from the page may be relative; a later step needs it to
                # resolve them into absolute URLs
                result = json.dumps({'url': content['url'].decode('utf-8'),
                                     'url_list': url_list})
                self.redis.rpush('unbloom_url_queue', result)
            html = zlib.compress(content['html'])
            headers = json.dumps(content['response_headers']).encode('utf-8')
            result1 = (content['url'], content['http_code'], headers, html,)
            cmd = """insert into html (url, http_code, response_headers, html)
                     values (%s, %s, %s, %s)"""
            self.db.query(cmd, result1)
            self.logger.info('Save [%s] to MySQL',
                             content['url'].encode('utf-8'))
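# A minimal sketch of the producer side this consumer expects: a fetcher that
# pushes each downloaded page (apparently Douban movie pages, judging by the
# xpaths and link regex above) onto the 'html' Redis list as a dict carrying
# url, html, http_code, and response_headers. The fetch_page helper, the use
# of the requests library, and pickle as the dict (de)serializer are all
# assumptions; only the queue name and field names come from
# parse_and_save_html above.
import pickle

import redis
import requests

def fetch_page(r, url):
    resp = requests.get(url, timeout=10)
    payload = pickle.dumps({
        'url': url.encode('utf-8'),
        'html': resp.content,                  # raw bytes; compressed on save
        'http_code': resp.status_code,
        'response_headers': dict(resp.headers),
    })
    # lpush pairs with the consumer's brpop to form a FIFO queue
    r.lpush('html', payload)

# fetch_page(redis.Redis(), 'https://movie.douban.com/subject/1/')  # illustrative URL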
# Import ETL process scripts
import extract
import transform
import load
import star_schema
import preprocessing
import model
import evaluation

# Set variables
server = "localhost"
database = "Fifa19"
initial_load = True

### EXTRACT ###
extractor = extract.Extract()
my_data = extractor.query_data(server=server, database=database, table="fifa_19")
df = my_data.copy()

### TRANSFORM ###
transformer = transform.Transform()
df = transformer.transform_data(df)

### STAR SCHEMA ###
schimera = star_schema.Star_Schema()
player_dim = schimera.apply_player_star_schema(df)
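# A plausible sketch of what extract.Extract.query_data could look like,
# assuming the project reads from a local SQL Server instance via pyodbc and
# pandas; the ODBC driver string and trusted-connection auth are assumptions,
# not taken from the actual extract module.
import pandas as pd
import pyodbc

class Extract:
    def query_data(self, server, database, table):
        conn = pyodbc.connect(
            "DRIVER={ODBC Driver 17 for SQL Server};"
            f"SERVER={server};DATABASE={database};Trusted_Connection=yes;"
        )
        try:
            # Pull the whole table into a DataFrame for downstream transforms
            return pd.read_sql(f"SELECT * FROM {table}", conn)
        finally:
            conn.close()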
        try:
            with open(fname, "w") as file:
                self.write(case, file, link)
        except IOError:
            print("Could not find the file: ", fname)

    def write(self, case, file, link):
        """ Write the case line by line to file """
        file.write(link + "\n")
        for line in case:
            file.write(line + "\n")


if __name__ == "__main__":
    fm = Format()
    ex = extract.Extract()

    # Should write a cleaned-up case in the test folder
    case = []
    ex.extract_case(
        "https://publications.parliament.uk/pa/ld199697/ldjudgmt/jd961121/smith01.htm",
        case)
    clean = fm.pretty_case(case)
    fm.save(clean, "1", "test")

    # Should catch an error in the file name and print it
    case = []
    ex.extract_case(
        "https://publications.parliament.uk/pa/ld200809/ldjudgmt/jd090617/assom.htm",
        case)
    clean = fm.pretty_case(case)
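# A small, self-contained check of the write helper above, using io.StringIO
# in place of a real file so nothing touches disk. The Format class and the
# write signature are the ones defined in this module; the _demo_write helper
# and the sample case lines are illustrative.
import io

def _demo_write():
    fm = Format()
    buf = io.StringIO()
    fm.write(["First paragraph.", "Second paragraph."], buf,
             "https://publications.parliament.uk/pa/ld199697/ldjudgmt/jd961121/smith01.htm")
    # The link comes first, then one case line per output line
    print(buf.getvalue())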