def map_warc_files(self, _, line):
    """Mapper function to process each WARC file.

    Args:
        line: Each line is a path to a WARC gz file to be processed.

    Returns:
        Generator of (key, value) tuples.
    """
    f = None
    # If we are on EC2 or running on a Hadoop cluster, pull files via S3
    if self.options.runner in ['emr', 'hadoop']:
        # Connect to Amazon S3.
        s3 = boto3.resource('s3')
        obj = s3.Object('commoncrawl', line)
        # Hack to get the raw stream out of obj:
        # http://stackoverflow.com/questions/7624900/how-can-i-use-boto-to-stream-a-file-out-of-amazon-s3-to-rackspace-cloudfiles
        f = warc.WARCFile(fileobj=GzipStreamFile(obj.get()['Body']._raw_stream))
    # If we are local, use files on the local file system
    else:
        line = Path.join(Path.abspath(Path.dirname(__file__)), line)
        print 'Loading local file {}'.format(line)
        f = warc.WARCFile(fileobj=gzip.open(line))

    # For each WARC record:
    for i, record in enumerate(f):
        for key, value in self.process_warc_record(record):
            yield key, value
    self.increment_counter('commoncrawl', 'num-files', 1)

def open_warc_file(filename, from_commoncrawl=True):
    """ Opens a WARC file from local-data or S3 for Common Crawl files """
    local_data_file = os.path.join(config["PATH_BACK"], 'local-data/%s' % filename)
    if not from_commoncrawl:
        filereader = open(filename, "rb")
    elif os.path.isfile(local_data_file):
        filereader = open(local_data_file, "rb")
    else:
        conn = boto.s3.connect_to_region(
            "us-east-1",
            anon=True,
            calling_format=boto.s3.connection.OrdinaryCallingFormat(),
            is_secure=False)
        pds = conn.get_bucket('aws-publicdatasets')
        filereader = Key(pds)
        filereader.key = filename

    if filename.endswith(".warc"):
        return warc.WARCFile(fileobj=filereader)
    else:
        # TODO: investigate how we could use cloudflare's zlib
        return warc.WARCFile(fileobj=GzipStreamFile(filereader))

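A minimal usage sketch for the open_warc_file helper above (not part of the original snippet); the WARC key shown is purely illustrative, and iteration relies on the standard dict-style record access of the warc library.

# Hypothetical usage of open_warc_file(); the key below is illustrative only.
warc_key = "crawl-data/CC-MAIN-2017-13/segments/example/warc/example-00000.warc.gz"
warcfile = open_warc_file(warc_key, from_commoncrawl=True)
for record in warcfile:
    if record['WARC-Type'] == 'response':
        print(record['WARC-Target-URI'])
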
def _warc_reader_from_file(self, filereader, filepath):
    """ Creates a WARC record iterator from a file reader """
    if filepath.endswith(".warc"):
        return warc.WARCFile(fileobj=filereader)
    else:
        # TODO: investigate how we could use cloudflare's zlib
        return warc.WARCFile(fileobj=GzipStreamFile(filereader))

def mapper(self, _, line):
    """
    The Map of MapReduce

    If you're using Hadoop or EMR, it pulls the CommonCrawl files from S3,
    otherwise it pulls from the local filesystem. Dispatches each file to
    `process_record`.
    """
    # If we're on EC2 or running on a Hadoop cluster, pull files via S3
    if self.options.runner in ['emr', 'hadoop']:
        # Connect to Amazon S3 using anonymous credentials
        boto_config = botocore.client.Config(
            signature_version=botocore.UNSIGNED,
            read_timeout=180,
            retries={'max_attempts': 20})
        s3client = boto3.client('s3', config=boto_config)
        # Verify bucket
        try:
            s3client.head_bucket(Bucket='commoncrawl')
        except botocore.exceptions.ClientError as exception:
            LOG.error('Failed to access bucket "commoncrawl": %s', exception)
            return
        # Check whether WARC/WAT/WET input exists
        try:
            s3client.head_object(Bucket='commoncrawl', Key=line)
        except botocore.client.ClientError as exception:
            LOG.error('Input not found: %s', line)
            return
        # Start a connection to one of the WARC/WAT/WET files
        LOG.info('Loading s3://commoncrawl/%s', line)
        try:
            temp = TemporaryFile(mode='w+b', dir=self.options.s3_local_temp_dir)
            s3client.download_fileobj('commoncrawl', line, temp)
        except botocore.client.ClientError as exception:
            LOG.error('Failed to download %s: %s', line, exception)
            return
        temp.seek(0)
        try:
            # ccfile = warc.WARCFile(fileobj=(GzipStreamFile(temp)))
            ccfile = warc.WARCFile(fileobj=(gzip.open(temp)))
        except Exception as exception:
            LOG.error('Failed to open %s at %s: %s', temp, line, exception)
            return
    # If we're local, use files on the local file system
    else:
        line = Path.join(Path.abspath(Path.dirname(__file__)), line)
        LOG.info('Loading local file %s', line)
        try:
            ccfile = warc.WARCFile(fileobj=gzip.open(line))
        except Exception as exception:
            LOG.error('Failed to open %s: %s', line, exception)
            return

    for _i, record in enumerate(ccfile):
        for key, value in self.process_record(record):
            yield key, value
        self.increment_counter('commoncrawl', 'processed_records', 1)

def get_records(id_, iterator):
    conn = boto.connect_s3(anon=True, host='s3.amazonaws.com')
    bucket = conn.get_bucket('commoncrawl')
    for uri in iterator:
        key_ = Key(bucket, uri)
        _file = warc.WARCFile(fileobj=GzipStreamFile(key_))
        for record in _file:
            if record['Content-Type'] == 'application/json':
                record = json.loads(record.payload.read())
                try:
                    def cc_filter(x):
                        return "creativecommons.org" in x['url']

                    cc_links = filter(
                        cc_filter,
                        list(record['Envelope']['Payload-Metadata']
                             ['HTTP-Response-Metadata']['HTML-Metadata']
                             ['Links']))
                    if len(cc_links) > 0:
                        yield record
                except KeyError:
                    pass

def warc_to_zip():
    warcfile = sys.argv[1]
    zipout = sys.argv[2]
    file = zipfile.ZipFile(zipout, "w", zipfile.ZIP_DEFLATED, True)
    f = warc.WARCFile(warcfile, "rb")
    for record in f:
        print "------------"
        for key in record.header.keys():
            print key, record.header[key]
        if record.header.has_key('warc-target-uri'):
            u = urlparse(record['WARC-Target-URI'])
            name = "{}/{}/{}".format(u.scheme, u.netloc, u.path)
            if record['content-type'] == "application/http;msgtype=response":
                r = httpparse(record.payload)
                file.writestr(name, r.read())
            elif record['content-type'] == "application/http;msgtype=request":
                print "payload:", record.payload
                print "Skipping request record", record['WARC-Target-URI']
                file.writestr("{}-request".format(name), record.payload.read())
            else:
                print "Skipping record", record['WARC-Target-URI']
                file.writestr(name, record.payload.read())
    file.close()

def mapper(self, _, line): f = None """ if self.options.runner in ['inline']: print self.options.runner + "lol" print 'Loading local file {}'.format(line) f = warc.WARCFile(fileobj=gzip.open(line)) else: """ conn = boto.connect_s3(anon=True) pds = conn.get_bucket('aws-publicdatasets') k = Key(pds, line) f = warc.WARCFile(fileobj=GzipStreamFile(k)) for i, record in enumerate(f): if record['Content-Type'] == 'application/http; msgtype=response': payload = record.payload.read() headers, body = payload.split('\r\n\r\n', 1) email = "email" p = re.compile(EMAIL_REGEX) emails = [i for i in re.findall(p, body) if len(i) < 50] emails = set(emails) domain = urlparse(record.url).netloc for email in emails: yield { "url": record.url, "date": record.date, "email": email, "domain": domain }, 1
def process_files(patt=PATT):
    files = list(data_dir.glob(patt))
    host_counter, server_counter = Counter(), Counter()
    for file in files:
        ccfile = warc.WARCFile(fileobj=gzip.open(file))
        for i, record in enumerate(ccfile):
            if record['Content-Type'] != 'application/json':
                continue
            payload = record.payload.read()
            data = json.loads(payload)
            if data['Envelope']['WARC-Header-Metadata']['WARC-Type'] != 'response':
                continue
            url = data["Envelope"]["WARC-Header-Metadata"].get("WARC-Target-URI")
            if url:
                host = urllib.parse.urlparse(url).netloc.lower()
                host_counter.update([host])
            server = data['Envelope']['Payload-Metadata'][
                'HTTP-Response-Metadata']['Headers'].get('Server')
            if server:
                server_counter.update([server])
    return host_counter, server_counter

def mapWat(self, _, line):
    '''
    Takes partial WARC paths and produces (hostname, {links}) pairs
    '''
    if self.options.localsource:
        # Stream data from local file
        # this lets us use pre-downloaded *.gz files for testing rather than
        # hammering the amazon servers.
        fpath = os.path.abspath(
            os.path.join(self.options.localsource, line))
        print('Loading local file: ' + fpath)
        rawstream = open(fpath, 'rb')
    else:
        # Stream data from common crawl servers
        conn = boto.connect_s3(anon=True, host='s3.amazonaws.com')
        pds = conn.get_bucket('commoncrawl')
        rawstream = boto.s3.key.Key(pds, line)

    # iterate through records in warc.wat.gz file
    warcstream = warc.WARCFile(fileobj=GzipStreamFile(rawstream))
    for i, record in enumerate(warcstream):
        if record['Content-Type'] == 'application/json':
            payload = record.payload.read()
            jsonPayload = json.loads(payload)
            hostlinks = self.watHostLinks(jsonPayload)
            if hostlinks:
                yield hostlinks
        if self.options.localsource and i % 10000 == 0:
            print('Record %5dk' % (i / 1000))
        self.increment_counter('commoncrawl', 'processed_records', 1)
    rawstream.close()

def mapper(self, _, line):
    f = None
    # if self.options.runner in ['inline']:
    #     print self.options.runner + "lol"
    #     print 'Loading local file {}'.format(line)
    #     f = warc.WARCFile(fileobj=gzip.open(line))
    # else:
    conn = boto.connect_s3(anon=True)
    pds = conn.get_bucket('aws-publicdatasets')
    k = Key(pds, line)
    f = warc.WARCFile(fileobj=GzipStreamFile(k))
    for i, record in enumerate(f):
        if record['Content-Type'] == 'application/http; msgtype=response':
            payload = record.payload.read()
            headers, body = payload.split('\r\n\r\n', 1)
            data = []
            # data = data + Detector().check_headers(headers)
            data = data + Detector().check_script(body)
            data = data + Detector().check_html(body)
            data = {
                "tech": data,
                "url": record.url,
                "date": record.date,
                "domain": urlparse(record.url).netloc
            }
            yield data, 1

def create(self, filename, fileobj=None, operator=None):
    """
    :rtype: warc.WARCFile
    """
    assert useragent.POLICY is not None
    if fileobj is None:
        fileobj = io.BytesIO()
    self.fileobj = fileobj
    self.warc = warc.WARCFile(fileobj=fileobj)
    header = warc.WARCHeader({
        "WARC-Type": "warcinfo",
        "WARC-Filename": filename,
    }, defaults=True)
    body = [
        b"software: owlbot/" + bytes(version.STR, "ascii"),
        b"format: WARC File Format 1.0",
        # policy from .OWLBOT_POLICY or os.environ["OWLBOT_POLICY"]
        b"robots: " + bytes(useragent.POLICY, "ascii"),
    ]
    if operator is not None:
        body.append(b"operator: " + operator.encode("utf-8"))
    self.warc.write_record(
        warc.WARCRecord(header, payload=b"\r\n".join(body))
    )

def read_warc(path='./data/sample.warc.gz'):
    # Beautiful soup HTML to text
    with gzip.open(path, mode='rb') as gzf:
        # cleantexts = []
        # doc_ids = []
        text_and_ids = []
        for i, record in enumerate(warc.WARCFile(fileobj=gzf)):
            if i == 0:
                continue
            # cleantexts.append(BeautifulSoup(record.payload.read(), 'lxml').text)
            soup = BeautifulSoup(record.payload.read(), 'lxml')
            for script in soup(["script", "style"]):
                script.extract()  # rip it out
            text = soup.get_text()
            # break into lines and remove leading and trailing space on each
            lines = (line.strip() for line in text.splitlines())
            # break multi-headlines into a line each
            chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
            # drop blank lines
            text = '\n'.join(chunk for chunk in chunks if chunk)
            text = remove_html_tags(text)
            # cleantexts.append(text)
            # doc_ids.append(record.header.get('WARC-TREC-ID'))
            doc_id = record.header.get('WARC-TREC-ID')
            text_and_ids.append((text, doc_id))
    return text_and_ids  # cleantexts, doc_ids

def clean_warc(input):
    text = v2.extract(input)
    warc_content = warc.WARCFile(fileobj=StringIO.StringIO(input))
    for record in warc_content:
        url, date = record['WARC-Target-URI'], record['WARC-Date']
        return '%s,%s\n%s' % (url, date, text)

def mapper(self, _, line):
    f = None

    ## If we're on EC2 or running on a Hadoop cluster, pull files via S3
    if self.options.runner in ['emr', 'hadoop']:
        # Connect to Amazon S3 using anonymous credentials
        conn = boto.connect_s3(anon=True)
        pds = conn.get_bucket('aws-publicdatasets')
        # Start a connection to one of the WARC files
        k = Key(pds, line)
        f = warc.WARCFile(fileobj=GzipStreamFile(k))
    ## If we're local, use files on the local file system
    else:
        print 'Loading local file {}'.format(line)
        f = warc.WARCFile(fileobj=gzip.open(line))

    ###
    for i, record in enumerate(f):
        for key, value in self.process_record(record):
            yield key, value
        self.increment_counter('commoncrawl', 'processed_records', 1)

def mapper(self, _, line): """ The map will download the file from commoncrawl, parse the file into multiple records, and process each record """ self.start_time = time.time() # Connect to Amazon S3 using anonymous credentials boto_config = botocore.client.Config( signature_version=botocore.UNSIGNED, read_timeout=180, retries={'max_attempts': 20}) s3client = boto3.client('s3', config=boto_config) # Check bucket existence try: s3client.head_bucket(Bucket='commoncrawl') except botocore.exceptions.ClientError as exception: LOG.error('Failed to access bucket "commoncrawl": %s', exception) return # Check if the input exists try: s3client.head_object(Bucket='commoncrawl', Key=line) except botocore.client.ClientError as exception: LOG.error('Input not found: %s', line) return # Download input sys.stderr.write("Downloading s3://commoncrawl/{}\n".format(line)) sys.stderr.write( time.strftime( "Download [START]. Distance from initial time: %Hh:%Mm:%Ss\n", time.gmtime(time.time() - self.start_time))) try: temp = TemporaryFile(mode='w+b', dir=self.options.s3_local_temp_dir) s3client.download_fileobj('commoncrawl', line, temp) except botocore.client.ClientError as exception: LOG.error('Failed to download %s: %s', line, exception) return sys.stderr.write( time.strftime( "Download [FINISHED]. Distance from initial time: %Hh:%Mm:%Ss\n", time.gmtime(time.time() - self.start_time))) temp.seek(0) ccfile = warc.WARCFile(fileobj=(GzipStreamFile(temp))) sys.stderr.write('Attempting MapReduce Job......\n') sys.stderr.write( time.strftime( "Processing [START]. Distance from initial time: %Hh:%Mm:%Ss\n", time.gmtime(time.time() - self.start_time))) for _i, record in enumerate(ccfile): for key, value in self.process_record(record): yield key, value self.increment_counter('commoncrawl', 'processed_records', 1) sys.stderr.write( time.strftime( "Processing [FINISHED]. Distance from initial time: %Hh:%Mm:%Ss\n", time.gmtime(time.time() - self.start_time)))
def __init__(self, f):
    if not os.path.isfile('records.json'):
        print('No records.json file was found.')
        print('We need the records.json for deduplication!')
        if 'n' in raw_input('Continue? [y/n]').lower():
            sys.exit(1)
    else:
        self.load_records()
    self.input_filename = f
    self.input_file = warc.WARCFile(self.input_filename)
    self.input_file_size = os.path.getsize(self.input_filename)
    self.output_filename = self.input_filename[:-8] \
        + '-deduplicated.warc.gz'
    self.output_file = warc.WARCFile(self.output_filename, 'w')
    self.output_log_filename = self.input_filename[:-8] \
        + '-deduplicated.log'
    self.output_log = []

def double_check(cls, f):
    input_file = warc.WARCFile(f)
    input_file_size = os.path.getsize(f)
    input_file_records = 0
    output_filename = f[:-8] + '-deduplicated.warc.gz'
    output_file = warc.WARCFile(output_filename)
    output_file_size = os.path.getsize(output_filename)
    output_file_records = 0
    while input_file_size > input_file.tell():
        for record in input_file:
            input_file_records += 1
    while output_file_size > output_file.tell():
        for record in output_file:
            output_file_records += 1
    input_file.close()
    output_file.close()
    return input_file_records == output_file_records - 1

def get_partial_warc_file(url, num_bytes=1024 * 10):
    """
    Download the first part of a WARC file and return a warc.WARCFile instance.

    url: the url of a gzipped WARC file
    num_bytes: the number of bytes to download. Default is 10KB

    return: warc.WARCFile instance
    """
    with closing(requests.get(url, stream=True)) as r:
        buf = StringIO(r.raw.read(num_bytes))
    return warc.WARCFile(fileobj=buf, compress=True)

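A short, hedged example of calling get_partial_warc_file (not from the original source); the URL is a placeholder, and since a partial download usually ends mid-record, the loop guards against the resulting read error.

# Illustrative only: any URL pointing at a gzipped WARC file would do here.
url = "https://data.commoncrawl.org/crawl-data/example/example.warc.gz"
partial = get_partial_warc_file(url, num_bytes=1024 * 100)
try:
    for record in partial:
        print(record.header.get('WARC-Target-URI'))
except Exception:
    # The final record of a truncated download is normally cut off.
    pass
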
def process_record(self, filepath):
    # NOTE: the original snippet took a `record` argument but read from an
    # undefined `filepath`; treating the argument as the path to the gzipped
    # WET file is an assumption made here so the function actually runs.
    f = warc.WARCFile(fileobj=gzip.open(filepath))
    vocab = defaultdict(int)
    nbwords = 0
    for i, record in enumerate(f):
        devnull = open(os.devnull, "w")
        if record['Content-Type'] != 'text/plain':
            continue
        page = record.payload.read()
        v, n = learn_vocab_from_train_iter(page)
        yield v, n
        self.increment_counter('commoncrawl', 'processed_pages', 1)

def create(file_path, i):
    entries = []
    r = redis.StrictRedis(host='localhost', port=6379, db=0)
    with gzip.open(file_path, mode='rb') as gzf:
        for record in warc.WARCFile(fileobj=gzf):
            url = record['WARC-Target-URI'].strip()
            html = record.payload.read()
            soup = BeautifulSoup(html, 'lxml')
            links = [link.get('href') for link in soup.find_all('a')]
            # `domain` was undefined in the original snippet; deriving it from
            # the record URL here is an assumption.
            domain = urlparse(url).netloc
            row = entityfeatureextractor.extract_row(domain, url, html)
            r.rpush(url, row)
            r.rpush(url, links)

def process_paths(self, id_, paths):
    ''' connect to s3 and get the data '''
    conn = boto.connect_s3(anon=True, host='s3.amazonaws.com')
    bucket = conn.get_bucket('commoncrawl')
    for uri in paths:
        key_ = Key(bucket, uri)
        archive_iterator = warc.WARCFile(fileobj=GzipStreamFile(key_))
        for record in archive_iterator:
            for res in self.process_record(record):
                yield res

def parse_archive(self, line):
    # Connect to Amazon S3 using anonymous credentials
    conn = boto.connect_s3(anon=True)
    pds = conn.get_bucket('aws-publicdatasets')
    # Start a connection to one of the WARC files
    k = Key(pds, line)
    f = warc.WARCFile(fileobj=GzipStreamFile(k))
    for record in f:
        if record['Content-Type'] != 'application/http; msgtype=response':
            continue
        self.doc_q.put(record.payload.read())
        self.count += 1

def mapper(self, _, line):
    f = None

    ## If we're on EC2 or running on a Hadoop cluster, pull files via S3
    if line.startswith("s3://"):
        print('Downloading ...', file=sys.stderr)
        key = None
        # Connect to Amazon S3 using anonymous credentials
        conn = boto.connect_s3(anon=True)
        if line.startswith("s3://"):
            pathStart = line.index('/', 5)
            bucketName = line[5:pathStart]
            keyPath = line[pathStart + 1:]
            print("Bucket: " + bucketName, file=sys.stderr)
            print("Key: " + keyPath, file=sys.stderr)
            bucket = conn.get_bucket(bucketName)
            key = Key(bucket, keyPath)
        else:
            print("Bucket: aws-publicdatasets", file=sys.stderr)
            print("Key: " + line, file=sys.stderr)
            bucket = conn.get_bucket("aws-publicdatasets")
            key = Key(bucket, line)
        # Start a connection to one of the WARC files
        f = warc.WARCFile(fileobj=GzipStreamFile(key))
    ## If we're local, use files on the local file system
    else:
        if line.startswith("file:///"):
            line = line[7:]
        print("Local: {}".format(line), file=sys.stderr)
        f = warc.WARCFile(fileobj=gzip.open(line))

    ###
    for i, record in enumerate(f):
        for key, value in self.process_record(record):
            yield key, value
        self.increment_counter('commoncrawl', 'processed_records', 1)

def __init__(self, filename=None, old_style=False):
    # We also support reading from archives, right? The main thing is to read
    # everything as bytes.
    if (filename is None):
        self._file = sys.stdin.buffer
    elif (filename.endswith('.xz')):
        self._file = lzma.open(filename, 'rb')
    elif (filename.endswith('.gz')):
        self._file = gzip.open(filename, 'rb')
    else:
        self._file = open(filename, 'rb')
    # I had to dig this out of the warc library's own sources.
    # Their documentation is really quite poor.
    self._warc = warc.WARCFile(fileobj=self._file)
    self._old_style = old_style

def parse_wet_file():
    # TODO: copy WET file from HDFS to tmp path
    gzip_fobj = gzip.open(wet_file, "r")
    warc_fobj = warc.WARCFile(fileobj=gzip_fobj, compress=False)
    while True:
        try:
            record = warc_fobj.read_record()
        except:
            continue
        if not record:
            break
        # TODO: got a warc record in record, parse it
    return

def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description=__name__)
    parser.add_argument('--source-url', '-u',
                        help='Remote URL to read input WARC file from.')
    parser.add_argument('--source-file', '-f',
                        help='Local path to read input WARC file from.')
    args = parser.parse_args()

    # Validate arguments
    if not (args.source_file or args.source_url):
        parser.error(
            "--source-file or --source-url argument must be provided.")

    if args.source_file is not None:
        source_string = args.source_file
        cf = open(args.source_file, 'rb')
    elif args.source_url is not None:
        source_string = args.source_url
        # Open a connection pool
        http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                                   ca_certs=certifi.where())
        # Open a streaming connection to the specified URL
        cf = http.request('GET', args.source_url, preload_content=False)

    # Wrap the filestream in a streamable unzipper
    f = warc.WARCFile(fileobj=GzipStreamFile(cf))

    warc_records = 0
    warc_responses = 0
    readable_pages = 0
    report_interval = 100
    start_time = arrow.utcnow()
    for record in f:
        if record['WARC-Type'] == 'response':
            warc_responses = warc_responses + 1
    end_time = arrow.utcnow()
    elapsed_time = end_time - start_time
    print("{} response records in file {} ({})".format(
        warc_responses, source_string, elapsed_time))

def process_document(self, doc):
    if doc.status == 200:
        self.concurrency_lock.acquire()
        try:
            # print base64.b64encode(doc.text)+"\t"+doc.url+"\t"+str(time.time())
            warc_record = warc.WARCRecord(payload=doc.text,
                                          headers={"WARC-Target-URI": doc.url})
            f = warc.WARCFile(fileobj=sys.stdout.buffer)
            f.write_record(warc_record)
            self.crawlsize += sys.getsizeof(doc.text) / 1000000.0
            if self.sizelimit != None and self.crawlsize > self.sizelimit:
                self.interrupt = True
                self.save_status()
            if self.timelimit != None and time.time() - self.crawlstarts > self.timelimit:
                self.interrupt = True
                self.save_status()
        finally:
            self.concurrency_lock.release()
    else:
        pass

def analyze_warc_file():
    host = []
    server = []
    f = warc.WARCFile(FILE, "rb")
    for num, record in enumerate(f, start=1):
        line = None
        record.payload.readline()
        line = record.payload.readline()
        print(str(line))
        if 'Host' in str(line):
            host.append(
                re.search(r':(.*)', str(line)).group(1).strip().replace("\\r\\n'", ''))
        if 'Server' in str(line):
            server.append(
                re.search(r':(.*)', str(line)).group(1).strip().replace("\\r\\n'", ''))
    print(server)
    print(host)
    return host, server

def parsefile(filename):
    f = warc.WARCFile(filename, 'r')
    invindex = {}
    pagetablekey = 1 + len(PAGETABLE)
    for record in f:
        url = record.header.get('warc-target-uri', None)
        if not url:
            continue
        words = record.payload.read().split()
        PAGETABLE[pagetablekey] = (url, len(words))
        for w in set(words):
            # if regex.match(w.decode('utf8')):
            if w not in invindex:
                invindex[w] = [pagetablekey]
            else:
                invindex[w].append(pagetablekey)
        pagetablekey += 1
    f.close()
    return invindex

def process_record(self, record):
    if record['WARC-Type'] != 'response':
        return
    # The HTTP response is defined by a specification: first part is headers
    # (metadata) and then following two CRLFs (newlines) has the response
    payload = record.payload.read()
    http_headers, body = payload.split('\r\n\r\n', 1)
    if 'Content-Type: text/html' in http_headers and body.strip():
        if ENDPOINT_RE.search(http_headers) or INDIEWEB_RE.search(body):
            warcstr = StringIO()
            warcfile = warc.WARCFile(fileobj=warcstr, mode='w')
            warcfile.write_record(
                warc.WARCRecord(payload=payload, header=record.header))
            warcbuf = base64.b64encode(warcstr.getvalue())
            warcfile.close()
            domain = urlparse.urlparse(
                record['WARC-Target-URI']).netloc.lower()
            # domain = headers['Host']
            yield domain, warcbuf