def mapper(self, _, line):
        # Connect to Amazon S3 anonymously and stream the WARC file at this path
        conn = boto.connect_s3(anon=True)
        pds = conn.get_bucket('aws-publicdatasets')
        k = Key(pds, line)
        f = warc.WARCFile(fileobj=GzipStreamFile(k))

        for i, record in enumerate(f):
            if record['Content-Type'] == 'application/http; msgtype=response':
                payload = record.payload.read()
                headers, body = payload.split('\r\n\r\n', 1)
                tech = []
                # tech += Detector().check_headers(headers)  # header checks currently disabled
                tech += Detector().check_script(body)
                tech += Detector().check_html(body)
                data = {
                    "tech": tech,
                    "url": record.url,
                    "date": record.date,
                    "domain": urlparse(record.url).netloc
                }
                yield data, 1
Example #2
    def mapper(self, _, line):
        # Connect to Amazon S3 anonymously and stream the WARC file at this path
        conn = boto.connect_s3(anon=True)
        pds = conn.get_bucket('aws-publicdatasets')
        k = Key(pds, line)
        f = warc.WARCFile(fileobj=GzipStreamFile(k))

        for i, record in enumerate(f):
            if record['Content-Type'] == 'application/http; msgtype=response':
                payload = record.payload.read()
                headers, body = payload.split('\r\n\r\n', 1)
                email = "email"
                p = re.compile(EMAIL_REGEX)
                emails = [i for i in re.findall(p, body) if len(i) < 50]
                emails = set(emails)
                domain = urlparse(record.url).netloc
                for email in emails:
                    yield {
                        "url": record.url,
                        "date": record.date,
                        "email": email,
                        "domain": domain
                    }, 1
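EMAIL_REGEX is a constant defined elsewhere in that project; a simple stand-in for trying the mapper locally (hypothetical, and far looser than production-grade email matching):

import re

# Hypothetical stand-in for the project's EMAIL_REGEX constant.
EMAIL_REGEX = r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"

assert re.findall(EMAIL_REGEX, "contact us at info@example.org") == ["info@example.org"]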
Example #3
    def open_dump(self):
        """ Returns a file-like object for the dump """

        if config["TESTDATA"] == "1":
            return open(self.dump_testdata, "rb")
        else:
            hdr = {
                'User-Agent': 'Mozilla/5.0 (compatible; commonBot; +https://about.commonsearch.org)'
            }

            req = urllib2.Request(self.dump_url, headers=hdr)

            f = urllib2.urlopen(req)

            if self.dump_compression == "zip":

                file_name = self.dump_compression_params[0]

                # TODO: is there a more efficient way of doing this? The file object
                # passed to ZipFile needs to support .seek(), so we buffer the whole download.
                zfile = zipfile.ZipFile(StringIO.StringIO(f.read()))
                return StringIO.StringIO(zfile.read(file_name))

            elif self.dump_compression == "gz":

                f.__dict__["closed"] = False  # Hack for GzipStreamFile
                return GzipStreamFile(f)

            else:
                return f
Example #4
    def mapWat(self, _, line):
        ''' Takes partial WARC paths and produces (hostname, {links}) pairs '''
        if self.options.localsource:
            # Stream data from local file
            # this lets us use pre-downloaded *.gz files for testing rather than
            # hammering the amazon servers.
            fpath = os.path.abspath(
                os.path.join(self.options.localsource, line))
            print('Loading local file: ' + fpath)
            rawstream = open(fpath, 'rb')
        else:
            # Stream data from common crawl servers
            conn = boto.connect_s3(anon=True, host='s3.amazonaws.com')
            pds = conn.get_bucket('commoncrawl')
            rawstream = boto.s3.key.Key(pds, line)

        # iterate through records in warc.wat.gz file
        warcstream = warc.WARCFile(fileobj=GzipStreamFile(rawstream))
        for i, record in enumerate(warcstream):
            if record['Content-Type'] == 'application/json':
                payload = record.payload.read()
                jsonPayload = json.loads(payload)
                hostlinks = self.watHostLinks(jsonPayload)
                if hostlinks: yield hostlinks
            if self.options.localsource and i % 10000 == 0:
                print('Record %5dk' % (i / 1000))
            self.increment_counter('commoncrawl', 'processed_records', 1)
        rawstream.close()
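watHostLinks is defined elsewhere in that project; a hypothetical sketch of such a helper, based on the WAT Envelope layout used in the later examples, which pairs the hostname of the target URI with the set of hosts it links out to:

from urllib.parse import urlparse  # urlparse.urlparse on Python 2


def wat_host_links(wat_json):
    """Hypothetical helper: return (hostname, {linked hostnames}) or None."""
    try:
        envelope = wat_json['Envelope']
        host = urlparse(envelope['WARC-Header-Metadata']['WARC-Target-URI']).netloc
        links = (envelope['Payload-Metadata']['HTTP-Response-Metadata']
                 ['HTML-Metadata']['Links'])
        # Relative links produce an empty netloc, so drop those.
        targets = {urlparse(link['url']).netloc
                   for link in links if 'url' in link} - {''}
        return host, targets
    except KeyError:
        return None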
Example #5
def open_warc_file(filename, from_commoncrawl=True):
    """ Opens a WARC file from local-data or S3 for Common Crawl files """

    local_data_file = os.path.join(config["PATH_BACK"],
                                   'local-data/%s' % filename)

    if not from_commoncrawl:
        filereader = open(filename, "rb")
    elif os.path.isfile(local_data_file):
        filereader = open(local_data_file, "rb")
    else:
        conn = boto.s3.connect_to_region(
            "us-east-1",
            anon=True,
            calling_format=boto.s3.connection.OrdinaryCallingFormat(),
            is_secure=False)

        pds = conn.get_bucket('aws-publicdatasets')
        filereader = Key(pds)
        filereader.key = filename

    if filename.endswith(".warc"):
        return warc.WARCFile(fileobj=filereader)
    else:
        # TODO: investigate how we could use cloudflare's zlib
        return warc.WARCFile(fileobj=GzipStreamFile(filereader))
Example #6
def get_records(id_, iterator):
    conn = boto.connect_s3(anon=True, host='s3.amazonaws.com')
    bucket = conn.get_bucket('commoncrawl')

    for uri in iterator:
        key_ = Key(bucket, uri)
        _file = warc.WARCFile(fileobj=GzipStreamFile(key_))

        for record in _file:
            if record['Content-Type'] == 'application/json':
                record = json.loads(record.payload.read())
                try:
                    links = (record['Envelope']['Payload-Metadata']
                             ['HTTP-Response-Metadata']['HTML-Metadata']['Links'])
                    # Keep only records that link to creativecommons.org
                    cc_links = [link for link in links
                                if "creativecommons.org" in link['url']]
                    if cc_links:
                        yield record
                except KeyError:
                    pass
Example #7
  def map_warc_files(self, _, line):
    """Mapper function to process each WARC file.

    Args:
      line: Each line is a path to a WARC gz file to be processed.

    Returns:
      Generator of (key, value) tuples.
    """
    f = None
    # If we are on EC2 or running on a Hadoop cluster, pull files via S3
    if self.options.runner in ['emr', 'hadoop']:
      # Connect to Amazon S3.
      s3 = boto3.resource('s3')
      obj = s3.Object('commoncrawl', line)
      # Hack to get the raw stream out of obj:
      # http://stackoverflow.com/questions/7624900/how-can-i-use-boto-to-stream-a-file-out-of-amazon-s3-to-rackspace-cloudfiles
      f = warc.WARCFile(fileobj=GzipStreamFile(obj.get()['Body']._raw_stream))
    # If we are local, use files on the local file system
    else:
      line = Path.join(Path.abspath(Path.dirname(__file__)), line)
      print 'Loading local file {}'.format(line)
      f = warc.WARCFile(fileobj=gzip.open(line))

    # For each WARC record:
    for i, record in enumerate(f):
      for key, value in self.process_warc_record(record):
        yield key, value
    self.increment_counter('commoncrawl', 'num-files', 1)
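The _raw_stream trick above reaches into a private botocore attribute. Since boto3's StreamingBody itself exposes read(), it can usually be handed to GzipStreamFile directly; a minimal sketch, on the assumption that GzipStreamFile only ever calls read() on the object it wraps:

import boto3
import warc
from gzipstream import GzipStreamFile

s3 = boto3.resource('s3')
# Hypothetical key; substitute a real path from a crawl's warc.paths listing.
obj = s3.Object('commoncrawl', 'crawl-data/CC-MAIN-2014-15/.../example.warc.gz')
body = obj.get()['Body']  # botocore StreamingBody, which supports read()
f = warc.WARCFile(fileobj=GzipStreamFile(body))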
Example #8
    def _warc_reader_from_file(self, filereader, filepath):
        """ Creates a WARC record iterator from a file reader """

        if filepath.endswith(".warc"):
            return warc.WARCFile(fileobj=filereader)
        else:
            # TODO: investigate how we could use cloudflare's zlib
            return warc.WARCFile(fileobj=GzipStreamFile(filereader))
Example #9
 def mapper(self, _, line):
     """
     The map will download the file from commoncrawl, parse the file into multiple records, and process each record
     """
     self.start_time = time.time()
     # Connect to Amazon S3 using anonymous credentials
     boto_config = botocore.client.Config(
         signature_version=botocore.UNSIGNED,
         read_timeout=180,
         retries={'max_attempts': 20})
     s3client = boto3.client('s3', config=boto_config)
     # Check bucket existence
     try:
         s3client.head_bucket(Bucket='commoncrawl')
     except botocore.exceptions.ClientError as exception:
         LOG.error('Failed to access bucket "commoncrawl": %s', exception)
         return
     # Check if the input exists
     try:
         s3client.head_object(Bucket='commoncrawl', Key=line)
     except botocore.exceptions.ClientError as exception:
         LOG.error('Input not found: %s', line)
         return
     # Download input
     sys.stderr.write("Downloading s3://commoncrawl/{}\n".format(line))
     sys.stderr.write(
         time.strftime(
             "Download [START]. Distance from initial time: %Hh:%Mm:%Ss\n",
             time.gmtime(time.time() - self.start_time)))
     try:
         temp = TemporaryFile(mode='w+b',
                              dir=self.options.s3_local_temp_dir)
         s3client.download_fileobj('commoncrawl', line, temp)
     except botocore.exceptions.ClientError as exception:
         LOG.error('Failed to download %s: %s', line, exception)
         return
     sys.stderr.write(
         time.strftime(
             "Download [FINISHED]. Distance from initial time: %Hh:%Mm:%Ss\n",
             time.gmtime(time.time() - self.start_time)))
     temp.seek(0)
     ccfile = warc.WARCFile(fileobj=(GzipStreamFile(temp)))
     sys.stderr.write('Attempting MapReduce Job......\n')
     sys.stderr.write(
         time.strftime(
             "Processing [START]. Distance from initial time: %Hh:%Mm:%Ss\n",
             time.gmtime(time.time() - self.start_time)))
     for _i, record in enumerate(ccfile):
         for key, value in self.process_record(record):
             yield key, value
         self.increment_counter('commoncrawl', 'processed_records', 1)
     sys.stderr.write(
         time.strftime(
             "Processing [FINISHED]. Distance from initial time: %Hh:%Mm:%Ss\n",
             time.gmtime(time.time() - self.start_time)))
Example #10
File: mrcc.py  Project: dcnoye/cc-mrjob
    def mapper(self, _, line):
        """
        The Map of MapReduce
        If you're using Hadoop or EMR, it pulls the Common Crawl files from S3,
        otherwise it pulls from the local filesystem. Dispatches each file to
        `process_record`.
        """
        # If we're on EC2 or running on a Hadoop cluster, pull files via S3
        if self.options.runner in ['emr', 'hadoop']:
            # Connect to Amazon S3 using anonymous credentials
            boto_config = botocore.client.Config(
                signature_version=botocore.UNSIGNED,
                read_timeout=180,
                retries={'max_attempts': 20})
            if self.options.bucket != 'commoncrawl':
                # use defaults if data is read from a custom bucket
                boto_config = botocore.client.Config()
            s3client = boto3.client('s3', config=boto_config)
            # Verify bucket
            try:
                s3client.head_bucket(Bucket=self.options.bucket)
            except botocore.exceptions.ClientError as exception:
                LOG.error('Failed to access bucket "%s": %s',
                          self.options.bucket, exception)
                return
            # Check whether WARC/WAT/WET input exists
            try:
                s3client.head_object(Bucket=self.options.bucket, Key=line)
            except botocore.exceptions.ClientError as exception:
                LOG.error('Input not found: %s', line)
                return
            # Start a connection to one of the WARC/WAT/WET files
            LOG.info('Loading s3://%s/%s', self.options.bucket, line)
            try:
                temp = TemporaryFile(mode='w+b',
                                     dir=self.options.s3_local_temp_dir)
                s3client.download_fileobj(self.options.bucket, line, temp)
            except botocore.exceptions.ClientError as exception:
                LOG.error('Failed to download %s: %s', line, exception)
                return
            temp.seek(0)
            ccfile = warc.WARCFile(fileobj=(GzipStreamFile(temp)))
        # If we're local, use files on the local file system
        else:
            line = Path.join(Path.abspath(Path.dirname(__file__)), line)
            LOG.info('Loading local file %s', line)
            ccfile = warc.WARCFile(fileobj=gzip.open(line))

        for _i, record in enumerate(ccfile):
            for key, value in self.process_record(record):
                yield key, value
            self.increment_counter('commoncrawl', 'processed_records', 1)
Example #11
    def process_paths(self, id_, paths):
        '''
        Connect to S3 and yield processed records for each WARC path.
        '''

        conn = boto.connect_s3(anon=True, host='s3.amazonaws.com')
        bucket = conn.get_bucket('commoncrawl')

        for uri in paths:
            key_ = Key(bucket, uri)
            archive_iterator = warc.WARCFile(fileobj=GzipStreamFile(key_))
            for record in archive_iterator:
                for res in self.process_record(record):
                    yield res
Example #12
File: crawler.py  Project: admhpr/python
    def parse_archive(self, line):
        # Connect to Amazon S3 using anonymous credentials
        conn = boto.connect_s3(anon=True)
        pds = conn.get_bucket('aws-publicdatasets')

        # Start a connection to one of the WARC files
        k = Key(pds, line)
        f = warc.WARCFile(fileobj=GzipStreamFile(k))

        for record in f:
            if record['Content-Type'] != 'application/http; msgtype=response':
                continue
            self.doc_q.put(record.payload.read())
            self.count += 1
Example #13
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description=__name__)
    parser.add_argument('--source-url',
                        '-u',
                        help='Remote URL to read input WARC file from.')
    parser.add_argument('--source-file',
                        '-f',
                        help='Local path to read input WARC file from.')

    args = parser.parse_args()
    # Validate arguments
    if not (args.source_file or args.source_url):
        parser.error(
            "--source-file or --source-url argument must be provided.")

    if args.source_file is not None:
        source_string = args.source_file
        cf = open(args.source_file, 'rb')
    elif args.source_url is not None:
        source_string = args.source_url
        # Open a connection pool
        http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                                   ca_certs=certifi.where())
        # Open a streaming connection to the specified URL
        cf = http.request('GET', args.source_url, preload_content=False)

    # Wrap the filestream in a streamable unzipper
    f = warc.WARCFile(fileobj=GzipStreamFile(cf))
    warc_responses = 0

    start_time = arrow.utcnow()
    for record in f:
        if record['WARC-Type'] == 'response':
            warc_responses = warc_responses + 1
    end_time = arrow.utcnow()
    elapsed_time = end_time - start_time
    print("{} response records in file {} ()".format(warc_responses,
                                                     source_string,
                                                     elapsed_time))
Example #14
File: mrcc.py  Project: tfmorris/cc-mrjob
 def mapper(self, _, line):
     f = None
     ## If we're on EC2 or running on a Hadoop cluster, pull files via S3
     if self.options.runner in ['emr', 'hadoop']:
         # Connect to Amazon S3 using anonymous credentials
         conn = boto.connect_s3(anon=True)
         pds = conn.get_bucket('aws-publicdatasets')
         # Start a connection to one of the WARC files
         k = Key(pds, line)
         f = warc.WARCFile(fileobj=GzipStreamFile(k))
     ## If we're local, use files on the local file system
     else:
         print 'Loading local file {}'.format(line)
         f = warc.WARCFile(fileobj=gzip.open(line))
     ###
     for i, record in enumerate(f):
         for key, value in self.process_record(record):
             yield key, value
         self.increment_counter('commoncrawl', 'processed_records', 1)
Example #15
def get_partial_warc_file(url):
    """
    we use the incredible gzipstreamfile module because of limitations with
    WARC Python module. Seriously, if the module won't have existed, this
    task would have been impossible. Thanks a lot to the creator

    https://github.com/commoncrawl/gzipstream

    return: warc.WARCFile instance
    """
    conn = boto.connect_s3(anon=True)
    pds = conn.get_bucket('aws-publicdatasets')
    # Start a connection to one of the WARC files
    k = Key(pds)
    k.key = url
    wf = warc.WARCFile(fileobj=GzipStreamFile(k))

    for num, record in enumerate(wf):
        try:
            print 'On Record {0}'.format(num)
            payload = record.payload.read()
            if payload[0] == "{":
                r = json.loads(payload)
                description = ''
                title = r['Envelope']['Payload-Metadata'][
                    'HTTP-Response-Metadata']['HTML-Metadata']['Head'][
                        'Title'].encode('utf-8')
                for x in r['Envelope']['Payload-Metadata'][
                        'HTTP-Response-Metadata']['HTML-Metadata']['Head'][
                            'Metas']:
                    if x['name'] == 'description':
                        description = x['content']
                uri = r['Envelope']['WARC-Header-Metadata']['WARC-Target-URI']
                with open('output.csv', 'ab') as f:
                    writer = csv.writer(f)
                    writer.writerow([title, description, uri])
        except:
            pass
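The deep indexing above raises KeyError for any record without full HTML head metadata, and the bare except then hides that along with every other error. Chained .get() calls make the fallback explicit; a minimal sketch:

def extract_head(record_json):
    """Return the HTML 'Head' metadata dict from a WAT record, or {} if absent."""
    return (record_json.get('Envelope', {})
                       .get('Payload-Metadata', {})
                       .get('HTTP-Response-Metadata', {})
                       .get('HTML-Metadata', {})
                       .get('Head', {}))


# head = extract_head(r)
# title = head.get('Title', '')
# description = next((m['content'] for m in head.get('Metas', [])
#                     if m.get('name') == 'description'), '')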
Example #16
 def mapper(self, _, line):
     ## Connect to Amazon S3 using anonymous credentials
     conn = boto.connect_s3(anon=True)
     pds = conn.get_bucket('aws-publicdatasets')
     ## Start a connection to one of the WARC files
     k = Key(pds, line)
     f = warc.WARCFile(fileobj=GzipStreamFile(k))
     ###
     for i, record in enumerate(f):
         # WARC records have three different types:
         #  ["application/warc-fields", "application/http; msgtype=request", "application/http; msgtype=response"]
         # We're only interested in the HTTP responses
         if record['Content-Type'] == 'application/http; msgtype=response':
             payload = record.payload.read()
             # An HTTP response starts with the headers (metadata), followed by a
             # blank line (two CRLFs) and then the response body
             headers, body = payload.split('\r\n\r\n', 1)
             if 'Content-Type: text/html' in headers:
                 # We avoid creating a new Counter for each page as that's actually quite slow
                 tag_count = get_tag_count(body)
                 for tag, count in tag_count.items():
                     yield tag, count
                 self.increment_counter('commoncrawl', 'processed_pages', 1)
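The substring test on the raw header block works, but anything more involved is easier once the headers are parsed into a dict. A minimal sketch that builds on the same payload.split('\r\n\r\n', 1) step used above:

def parse_http_headers(header_blob):
    """Split a raw HTTP header block into (status line, {lowercased name: value})."""
    lines = header_blob.split('\r\n')
    header_dict = {}
    for header_line in lines[1:]:
        if ':' in header_line:
            name, _, value = header_line.partition(':')
            header_dict[name.strip().lower()] = value.strip()
    return lines[0], header_dict


# In the mapper above the raw block is the variable named `headers`:
# status_line, header_dict = parse_http_headers(headers)
# if header_dict.get('content-type', '').startswith('text/html'):
#     ...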
Example #17
 def mapper(self, _, line):
     """
     Override default mapper. Not yielding anything
     """
     # Connect to Amazon S3 using anonymous credentials
     boto_config = botocore.client.Config(
         signature_version=botocore.UNSIGNED,
         read_timeout=180,
         retries={'max_attempts': 20})
     s3client = boto3.client('s3', config=boto_config)
     # Check if the bucket exists
     try:
         s3client.head_bucket(Bucket='commoncrawl')
     except botocore.exceptions.ClientError as exception:
         LOG.error('Failed to access bucket "commoncrawl": %s', exception)
         return
     # Check if the input exists
     try:
         s3client.head_object(Bucket='commoncrawl', Key=line)
     except botocore.exceptions.ClientError as exception:
         LOG.error('Input not found: %s', line)
         return
     # Download input files
     LOG.info('Downloading s3://commoncrawl/%s', line)
     try:
         temp = TemporaryFile(mode='w+b',
                              dir=self.options.s3_local_temp_dir)
         s3client.download_fileobj('commoncrawl', line, temp)
     except botocore.exceptions.ClientError as exception:
         LOG.error('Failed to download %s: %s', line, exception)
         return
     temp.seek(0)
     ccfile = warc.WARCFile(fileobj=(GzipStreamFile(temp)))
     LOG.info('Attempting MapReduce Job......')
     for _i, record in enumerate(ccfile):  #don't yield the result
         self.process_record(record)
         self.increment_counter('commoncrawl', 'processed_records', 1)
Example #18
 def mapper(self, _, line):
   f = None
   ## If we're on EC2 or running on a Hadoop cluster, pull files via S3
   if line.startswith("s3://"):
   
     print('Downloading ...',file=sys.stderr)
     key = None
     
     # Connect to Amazon S3 using anonymous credentials
     conn = boto.connect_s3(anon=True)
     if line.startswith("s3://"):
        pathStart = line.index('/',5)
        bucketName = line[5:pathStart]
        keyPath = line[pathStart+1:]
        print("Bucket: "+bucketName,file=sys.stderr)
        print("Key: "+keyPath,file=sys.stderr)
        bucket = conn.get_bucket(bucketName)
        key = Key(bucket,keyPath)
     else:
        print("Bucket: aws-publicdatasets",file=sys.stderr)
        print("Key: "+line,file=sys.stderr)
        bucket = conn.get_bucket("aws-publicdatasets")
        key = Key(bucket,line)
     # Start a connection to one of the WARC files
     f = warc.WARCFile(fileobj=GzipStreamFile(key))
     
   ## If we're local, use files on the local file system
   else:
     if line.startswith("file:///"):
        line = line[7:]
     print("Local: {}".format(line),file=sys.stderr)
     f = warc.WARCFile(fileobj=gzip.open(line))
   ###
   for i, record in enumerate(f):
     for key, value in self.process_record(record):
       yield key, value
     self.increment_counter('commoncrawl', 'processed_records', 1)
Example #19
    def open_dump(self):
        """ Returns a file-like object for the dump """

        if config["TESTDATA"] == "1":
            return open(self.dump_testdata, "rb")
        else:
            f = urllib2.urlopen(self.dump_url)

            if self.dump_compression == "zip":

                file_name = self.dump_compression_params[0]

                # TODO: is there a more efficient way of doing this? The file object
                # passed to ZipFile needs to support .seek(), so we buffer the whole download.
                zfile = zipfile.ZipFile(StringIO.StringIO(f.read()))
                return StringIO.StringIO(zfile.read(file_name))

            elif self.dump_compression == "gz":

                f.__dict__["closed"] = False  # Hack for GzipStreamFile
                return GzipStreamFile(f)

            else:
                return f
Example #20
conn = boto.connect_s3(anon=True, debug=2)
bucket = conn.get_bucket('commoncrawl')
list1 = bucket.list(prefix="crawl-data/CC-MAIN")

#list1=bucket.get_all_keys(maxkeys=0)
lookup = raw_input("Enter Lookup")
for key in list1:
    #print key
    #print dir(key)
    #print key.name
    if "wet" in key.name and lookup in key.name:
        print key
        if key.name in dicta:
            continue
        try:
            for l in GzipStreamFile(key):
                #print l
                result = prog.findall(l)
                for r in result:
                    #print l
                    #print r
                    #raw_input()
                    domain = r[0].split("@")[1]
                    table.insert(
                        dict(domain=unidecode(domain),
                             email=unidecode(r[0]),
                             text=unidecode(l)))
        except:
            time.sleep(60)
            conn = boto.connect_s3(anon=True, debug=2)
            bucket = conn.get_bucket('commoncrawl')
Example #21
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description=__name__)
    parser.add_argument('--source-url',
                        '-u',
                        help='Remote URL to read input WARC file from.')
    parser.add_argument('--source-file',
                        '-f',
                        help='Local path to read input WARC file from.')
    parser.add_argument('--output-dir',
                        '-o',
                        help='Directory to write processed web pages to.')
    parser.add_argument(
        '--max-pages',
        '-m',
        type=int,
        help='Maximum number of web pages to process from WARC file.')

    args = parser.parse_args()
    # Validate arguments
    if not (args.source_file or args.source_url):
        parser.error(
            "--source-file or --source-url argument must be provided.")
    if not args.output_dir:
        parser.error("--output-dir argument must be provided.")

    # Make sure output directories exists
    original_pages_dir = os.path.join(args.output_dir, 'original')
    readable_pages_dir = os.path.join(args.output_dir, 'readable')
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    if not os.path.exists(original_pages_dir):
        os.makedirs(original_pages_dir)
    if not os.path.exists(readable_pages_dir):
        os.makedirs(readable_pages_dir)

    if args.source_file is not None:
        cf = open(args.source_file, 'rb')
    elif args.source_url is not None:
        # Open a connection pool
        http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                                   ca_certs=certifi.where())
        # Open a streaming connection to the specified URL
        cf = http.request('GET', args.source_url, preload_content=False)

    # Wrap the filestream in a streamable unzipper
    f = warc.WARCFile(fileobj=GzipStreamFile(cf))
    warc_records = 0
    warc_responses = 0
    readable_pages = 0
    report_interval = 100

    start_time = arrow.utcnow()
    for record in f:
        if record['WARC-Type'] == 'response':
            if (args.max_pages and warc_responses >= args.max_pages):
                print("Reached maximum WARC responses ({})".format(
                    args.max_pages))
                break
            warc_responses = warc_responses + 1
            try:
                id = record.header["WARC-Record-ID"][10:-1]
                fp = record.payload
                # Open file using WARC Record ID as filename
                original_page_path = os.path.join(original_pages_dir,
                                                  "{}.txt".format(id))
                readable_page_path = os.path.join(readable_pages_dir,
                                                  "{}.txt".format(id))
                with open(original_page_path, 'w') as fout:
                    while True:
                        # Discard Header rows
                        line = fp.readline()
                        # Header rows are separated from page contents by a blank line
                        if line == "\r\n":
                            break
                    # Write page contents to file
                    fout.write(fp.read())
                # Process page with readability script
                subprocess.check_call([
                    'node', 'page_to_readable_page.js', original_page_path,
                    readable_page_path
                ])
                readable_pages = readable_pages + 1
                #  TODO: Persist file to blob storage and remove readable file
            except:
                pass
            # Clean up files created during processing
            try:
                os.remove(original_page_path)
            except:
                pass
            if warc_responses % report_interval == 0:
                print("Processed {} WARC pages ({} readable pages)".format(
                    warc_responses, readable_pages))
    end_time = arrow.utcnow()
    elapsed_time = end_time - start_time
    print("Processed {} WARC pages ({} readable pages) in {}".format(
        warc_responses, readable_pages, elapsed_time))
Example #22
filepath = 'crawl-data/CC-MAIN-2016-50/segments/1480698540409.8/warc/CC-MAIN-20161202170900-00000-ip-10-31-129-80.ec2.internal.warc.gz'

# establish anonymous connection to commoncrawl warc file bucket
conn = boto.s3.connect_to_region(
    "us-east-1",
    anon=True,
    calling_format=boto.s3.connection.OrdinaryCallingFormat(),
    is_secure=False)
bucket = conn.get_bucket('commoncrawl')
#filereader = Key(bucket)
filereader = boto.s3.key.Key(bucket)
filereader.key = filepath

# Updating the code to Python 3 standards: GzipStreamFile is really just GzipFile

warc_file = warc.WARCFile(fileobj=GzipStreamFile(filereader))

for record in warc_file:
    print(record.payload.read())
#warc_file = warc.WARCFile(fileobj=GzipFile(filereader))

#warc_file = warc.WARCFile(fileobj=filereader)

#warc_stream = open_warc_stream(partition["path"])

#for record in warc_stream:
#    print(record.payload.read())
#payload = record.payload.read()
#parser = HttpParser()
#parser.execute(payload, len(payload))
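# A minimal sketch of the note above ("GzipStreamFile is really just GzipFile"):
# for sequential reads, Python 3's gzip.GzipFile accepts an unseekable,
# read()-only file object and handles multi-member gzip, so it can usually
# stand in for GzipStreamFile. Untested assumption for boto's Key object:
#
#   import gzip
#   warc_file = warc.WARCFile(fileobj=gzip.GzipFile(fileobj=filereader))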
'''
    try:
        s3client.head_object(Bucket='commoncrawl', Key=line)
    except botocore.client.ClientError as exception:
        LOG.error('Input not found: %s', line)

    # Start a connection to one of the WARC/WAT/WET files
    LOG.info('Loading s3://commoncrawl/%s', line)
    try:
        temp = TemporaryFile(
            mode='w+b',
            dir=
            'C:\Users\Aditya\Documents\demonstrational\Radii Corporation\common-crawl-extractor'
        )
        s3client.download_fileobj('commoncrawl', line, temp)
    except botocore.client.ClientError as exception:
        LOG.error('Failed to download %s: %s', line, exception)

    temp.seek(0)

    # The warc library accepts file like objects, so let's use GzipStreamFile
    ccfile = warc.WARCFile(fileobj=(GzipStreamFile(temp)))
    for num, record in enumerate(ccfile):
        if record['WARC-Type'] == 'response':
            # Imagine we're interested in the URL, the length of content, and any Content-Type strings in there
            print(record['WARC-Target-URI'], record['Content-Length'])
            print('\n'.join(x for x in record.payload.read().replace(
                '\r', '').split('\n\n')[0].split('\n')
                            if 'content-type:' in x.lower()))
            print('=-=-' * 10)
        if num > 100:
            break
Example #24
def getHeaders(id_, iterator):

    conn = S3Connection(host="s3.amazonaws.com")
    bucket = conn.get_bucket("commoncrawl")

    for uri in iterator:
        key_ = Key(bucket, uri)
        file_ = warc.WARCFile(fileobj=GzipStreamFile(key_))

        for line in file_:
            try:
                data = json.loads(line.payload.read())

                #--------------------------- BUILD SUMMARY ------------------------------+
                #  Purpose:   For every response record in the current WAT file, build a
                #     two-element list retArray containing:
                #       - the MD5 hash of the hostname
                #       - an integer whose bits flag the presence of each security header
                #     Any exceptions are disregarded and processing continues.
                #
                #  Parameters:
                #     - HTTP response security headers from the current WAT record
                #     - bit-flag constants, one per header
                #
                #  Result:    one summary entry per WAT response record
                #------------------------------------------------------------------------+
                retArray = [None, 0b000000000000000000000]
                if (data["Envelope"]["WARC-Header-Metadata"]["WARC-Type"] ==
                        "response"):

                    retArray[0] = hashlib.md5(
                        urlparse(data["Envelope"]["WARC-Header-Metadata"].get(
                            "WARC-Target-URI", "")).hostname).digest()
                    if (data["Envelope"]["Payload-Metadata"]
                        ["HTTP-Response-Metadata"]["Headers"].get(
                            "X-XSS-Protection", "") != ""):
                        retArray[1] = retArray[1] | X_XSS_Protection_FLAG
                    if (data["Envelope"]["Payload-Metadata"]
                        ["HTTP-Response-Metadata"]["Headers"].get(
                            "Content-Security-Policy", "") != ""):
                        retArray[
                            1] = retArray[1] | Content_Security_Policy_FLAG
                    if (data["Envelope"]["Payload-Metadata"]
                        ["HTTP-Response-Metadata"]["Headers"].get(
                            "X-Content-Security-Policy", "") != ""):
                        retArray[
                            1] = retArray[1] | X_Content_Security_Policy_FLAG
                    if (data["Envelope"]["Payload-Metadata"]
                        ["HTTP-Response-Metadata"]["Headers"].get(
                            "X-Frame-Options", "") != ""):
                        retArray[1] = retArray[1] | X_Frame_Options_FLAG
                    if (data["Envelope"]["Payload-Metadata"]
                        ["HTTP-Response-Metadata"]["Headers"].get(
                            "Strict-Transport-Security", "") != ""):
                        retArray[
                            1] = retArray[1] | Strict_Transport_Security_FLAG
                    if (data["Envelope"]["Payload-Metadata"]
                        ["HTTP-Response-Metadata"]["Headers"].get(
                            "X-Content-Type-Options", "") != ""):
                        retArray[1] = retArray[1] | X_Content_Type_Options_FLAG
                    if (data["Envelope"]["Payload-Metadata"]
                        ["HTTP-Response-Metadata"]["Headers"].get(
                            "X-Download-Options", "") != ""):
                        retArray[1] = retArray[1] | X_Download_Options_FLAG
                    if (data["Envelope"]["Payload-Metadata"]
                        ["HTTP-Response-Metadata"]["Headers"].get(
                            "X-Permitted-Cross-Domain-Policies", "") != ""):
                        retArray[1] = retArray[
                            1] | X_Permitted_Cross_Domain_Policies_FLAG
                    if (data["Envelope"]["Payload-Metadata"]
                        ["HTTP-Response-Metadata"]["Headers"].get(
                            "Expect-CT", "") != ""):
                        retArray[1] = retArray[1] | Expect_CT_FLAG
                    if (data["Envelope"]["Payload-Metadata"]
                        ["HTTP-Response-Metadata"]["Headers"].get(
                            "Feature-Policy", "") != ""):
                        retArray[1] = retArray[1] | Feature_Policy_FLAG
                    if (data["Envelope"]["Payload-Metadata"]
                        ["HTTP-Response-Metadata"]["Headers"].get(
                            "Referrer-Policy", "") != ""):
                        retArray[1] = retArray[1] | Referrer_Policy_FLAG
                    if (data["Envelope"]["Payload-Metadata"]
                        ["HTTP-Response-Metadata"]["Headers"].get(
                            "X-Public-Key-Pins", "") != ""):
                        retArray[1] = retArray[1] | X_Public_Key_Pins_FLAG
                    if (data["Envelope"]["Payload-Metadata"]
                        ["HTTP-Response-Metadata"]["Headers"].get(
                            "X-Public-Key-Pins-Report-Only", "") != ""):
                        retArray[1] = retArray[
                            1] | X_Public_Key_Pins_Report_Only_FLAG
                    if (data["Envelope"]["Payload-Metadata"]
                        ["HTTP-Response-Metadata"]["Headers"].get(
                            "Public-Key-Pins", "") != ""):
                        retArray[1] = retArray[1] | Public_Key_Pins_FLAG
                    if (data["Envelope"]["Payload-Metadata"]
                        ["HTTP-Response-Metadata"]["Headers"].get(
                            "Public-Key-Pins-Report-Only", "") != ""):
                        retArray[
                            1] = retArray[1] | Public_Key_Pins_Report_Only_FLAG
                    if (data["Envelope"]["Payload-Metadata"]
                        ["HTTP-Response-Metadata"]["Headers"].get(
                            "Access-Control-Allow-Origin", "") != ""):
                        retArray[
                            1] = retArray[1] | Access_Control_Allow_Origin_FLAG
                    if (data["Envelope"]["Payload-Metadata"]
                        ["HTTP-Response-Metadata"]["Headers"].get(
                            "Access-Control-Allow-Credentials", "") != ""):
                        retArray[1] = retArray[
                            1] | Access_Control_Allow_Credentials_FLAG
                    if (data["Envelope"]["Payload-Metadata"]
                        ["HTTP-Response-Metadata"]["Headers"].get(
                            "Access-Control-Allow-Methods", "") != ""):
                        retArray[1] = retArray[
                            1] | Access_Control_Allow_Methods_FLAG
                    if (data["Envelope"]["Payload-Metadata"]
                        ["HTTP-Response-Metadata"]["Headers"].get(
                            "Access-Control-Allow-Headers", "") != ""):
                        retArray[1] = retArray[
                            1] | Access_Control_Allow_Headers_FLAG
                    if (data["Envelope"]["Payload-Metadata"]
                        ["HTTP-Response-Metadata"]["Headers"].get(
                            "Access-Control-Expose-Headers", "") != ""):
                        retArray[1] = retArray[
                            1] | Access_Control_Expose_Headers_FLAG
                    if (data["Envelope"]["Payload-Metadata"]
                        ["HTTP-Response-Metadata"]["Headers"].get(
                            "Access-Control-Max-Age", "") != ""):
                        retArray[1] = retArray[1] | Access_Control_Max_Age_FLAG

                    yield retArray

            except ValueError:
                continue
            except KeyError:
                continue
            except UnboundLocalError:
                continue
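The if-chain above can be collapsed into a table-driven loop; a minimal sketch, with illustrative flag values standing in for the module-level *_FLAG constants the original relies on:

# Illustrative bit flags; the real values come from the project's *_FLAG constants.
X_XSS_Protection_FLAG = 1 << 0
Content_Security_Policy_FLAG = 1 << 1
X_Frame_Options_FLAG = 1 << 2
# ... one constant per tracked header, as in the example above ...

HEADER_FLAGS = {
    "X-XSS-Protection": X_XSS_Protection_FLAG,
    "Content-Security-Policy": Content_Security_Policy_FLAG,
    "X-Frame-Options": X_Frame_Options_FLAG,
    # ... remaining header names map to their flags the same way ...
}


def header_bits(headers):
    """Return an int whose bits record which security headers are present."""
    bits = 0
    for name, flag in HEADER_FLAGS.items():
        if headers.get(name, "") != "":
            bits |= flag
    return bits


# headers = data["Envelope"]["Payload-Metadata"]["HTTP-Response-Metadata"]["Headers"]
# retArray[1] = header_bits(headers)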
Example #25
import boto
from boto.s3.key import Key
from gzipstream import GzipStreamFile
import warc

if __name__ == '__main__':
    # Let's use a random gzipped web archive (WARC) file from the 2014-15 Common Crawl dataset
    ## Connect to Amazon S3 using anonymous credentials
    conn = boto.connect_s3(anon=True)
    pds = conn.get_bucket('aws-publicdatasets')
    ## Start a connection to one of the WARC files
    k = Key(pds)
    k.key = 'common-crawl/crawl-data/CC-MAIN-2014-15/segments/1397609521512.15/warc/CC-MAIN-20140416005201-00000-ip-10-147-4-33.ec2.internal.warc.gz'

    # The warc library accepts file like objects, so let's use GzipStreamFile
    f = warc.WARCFile(fileobj=GzipStreamFile(k))
    for num, record in enumerate(f):
        if record['WARC-Type'] == 'response':
            # Imagine we're interested in the URL, the length of content, and any Content-Type strings in there
            print record['WARC-Target-URI'], record['Content-Length']
            print '\n'.join(x for x in record.payload.read().replace(
                '\r', '').split('\n\n')[0].split('\n')
                            if 'content-type:' in x.lower())
            print '=-=-' * 10
        if num > 100:
            break