Example #1
 def process_stream(self):
     """
     Process the followers' 10-minute tweet streams.
     :return:
     """
     # Load the cached follower stream and run it through the extractor
     with open('tmp_followers_stream.txt', 'r') as f:
         stream = json.load(f)
     e = extract.Extract(stream)
     self.followers_stream = e.process_stream()
Example #2
 def __init__(self):
     self.ex = extract.Extract()
     self.sc = scrape.Scrape()
     self.fm = format.Format()
     self.name = 1  # Cases are named with sequential numbers, starting at 1
     self.allowed = [
         "09", "08", "07", "06", "05", "04", "03", "02", "01", "00", "99",
         "98", "97", "96"
     ]
Example #3
	def parse_and_save_html(self):
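		"""
		Pop crawled pages from the Redis 'html' list; for URLs that match the
		pattern, extract movie fields and insert them into MySQL, otherwise
		extract new links and push them back to Redis. In both cases, archive
		the compressed HTML together with its response headers.
		"""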
		while 1:
			# signal.SIGINT corresponds to Ctrl+C; call self.stop when this signal is received
			signal.signal(signal.SIGINT, self.stop)
			if self.redis.lindex('html'):
				content = self.redis.brpop('html')
				text = content['html'].decode('utf-8')
				extracts = extract.Extract(item=items.Item(), text=text, selector=parsel.Selector)
				# If the URL matches the regex, structured data should be extracted from the response
				if self.patten.search(content['url'].decode('utf-8')):	
					extracts.item_xpath('movie_name', '//h1/span[@property="v:itemreviewed"]/text()')
					extracts.item_xpath('movie_year', '//span[@property="v:initialReleaseDate"]/text()')
					extracts.item_xpath('movie_type', '//span[@property="v:genre"]/text()')
					extracts.item_xpath('movie_rate', '//strong[@class="ll rating_num"]/text()')
					item = extracts.get_item()
					result_item = (content['url'], 
							   item['movie_name'],
							   item['movie_year'],
							   item['movie_type'],
							   item['movie_rate'],)

					cmd = """insert into item (url, movie_name, movie_year,movie_type, movie_rate) 
						  values (%s, %s, %s, %s, %s)"""
					self.db.query(cmd, result_item)
				else:
					extracts.link_xpath('//a/@href', r'/subject/[0-9]+/$|/tag/.*')
					url_list = extracts.get_links()
					# The URL is passed along because links extracted from the page may be relative
					# and must be resolved to absolute URLs in a later step
					result = json.dumps({'url': content['url'], 'url_list': url_list})
					self.redis.rpush('unbloom_url_queue', result)

				html = zlib.compress(content['html'])
				headers = json.dumps(content['response_headers']).encode('utf-8')

				result1 = (content['url'], content['http_code'], headers, html,)
				cmd = """insert into html (url, http_code, response_headers, html) 
					  values (%s, %s, %s, %s)"""
				self.db.query(cmd, result1)
				self.logger.info('Saved [%s] to MySQL', content['url'].decode('utf-8'))
Example #4
# Import ETL process scripts
import extract
import transform
import load
import star_schema
import preprocessing
import model
import evaluation

# Set variables
server = "localhost"
database = "Fifa19"
initial_load = True

### EXTRACT ###
extractor = extract.Extract()

my_data = extractor.query_data(server=server,
                               database=database,
                               table="fifa_19")
df = my_data.copy()

### TRANSFORM ###
transformer = transform.Transform()

df = transformer.transform_data(df)

### STAR SCHEMA ###
schema = star_schema.Star_Schema()

player_dim = schema.apply_player_star_schema(df)
Example #5
            file = open(fname, "w")
            self.write(case, file, link)
        except IOError:
            print("Courld not find the file: ", fname)

    def write(self, case, file, link):
        """
        Write the case to the file, line by line.
        """
        file.write(link + "\n")
        for line in case:
            file.write(line + "\n")


if __name__ == "__main__":
    fm = Format()
    ex = extract.Extract()

    # Should write a cleaned-up case to the test folder
    case = []
    ex.extract_case(
        "https://publications.parliament.uk/pa/ld199697/ldjudgmt/jd961121/smith01.htm",
        case)
    clean = fm.pretty_case(case)
    fm.save(clean, "1", "test")

    # Should catch an error in the file name and print it
    case = []
    ex.extract_case(
        "https://publications.parliament.uk/pa/ld200809/ldjudgmt/jd090617/assom.htm",
        case)
    clean = fm.pretty_case(case)