Example #1
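# Grep-like filter: the first positional argument is a regex pattern; the
# remaining arguments are WARC files to filter (stdin when none are given).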
def main(argv):
    (options, input_files) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if len(input_files) < 1:
        parser.error("no pattern")

    pattern, input_files = input_files[0], input_files[1:]

    invert = options.invert
    pattern = re.compile(pattern)
    if not input_files:
        fh = ArchiveRecord.open_archive(file_handle=sys.stdin, gzip=None)
        filter_archive(fh, options, pattern, out)
    else:
        for name in input_files:
            fh = ArchiveRecord.open_archive(name, gzip="auto")
            filter_archive(fh, options, pattern, out)
            fh.close()

    return 0
Example #2
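# Unpack the records of one or more WARC files (or stdin) into an output
# directory, logging an index of the extracted files.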
def main(argv):
    (options, args) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if options.output:
        if not os.path.exists(options.output):
            os.makedirs(options.output)
        output_dir = options.output
    else:
        output_dir = os.getcwd()

    collisions = 0

    if len(args) < 1:
        log_file = sys.stdout if not options.log_file else open(options.log_file, 'wb')
        log_headers(log_file)

        with closing(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None)) as fh:
            collisions += unpack_records('<stdin>', fh, output_dir, options.default_name, log_file, options.wayback)

    else:
        for filename in args:
            log_file = os.path.join(output_dir, os.path.basename(filename) + '.index.txt') if not options.log_file else options.log_file
            log_file = open(log_file, 'wb')
            log_headers(log_file)
            try:
                with closing(ArchiveRecord.open_archive(filename=filename, gzip="auto")) as fh:
                    collisions += unpack_records(filename, fh, output_dir, options.default_name, log_file, options.wayback)
            except StandardError, e:
                print >> sys.stderr, "exception in handling", filename, e
Example #4
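# Validate WARC files record by record; report the first parse or
# validation error in each file and stop reading it.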
def main(argv):
    (options, input_files) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if len(input_files) < 1:
        parser.error("no imput warc file(s)")
        

    correct=True
    fh=None
    try:
        for name in input_files:
            fh = ArchiveRecord.open_archive(name, gzip="auto")

            for (offset, record, errors) in fh.read_records(limit=None):
                if errors:
                    print >> sys.stderr, "warc errors at %s:%d" % (name, offset)
                    print >> sys.stderr, errors
                    correct = False
                    break
                elif record is not None and record.validate():  # ugh, name: validate() returns errors
                    print >> sys.stderr, "warc errors at %s:%d" % (name, offset)
                    print >> sys.stderr, record.validate()
                    correct = False
                    break

    except StandardError, e:
        correct = False
Example #5
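    # Scan a WARC file for metadata records and print hop info or outlinks,
    # depending on self.parseType.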
    def parse_metadata(self):
        fh = ArchiveRecord.open_archive(self.file, gzip="auto", mode="r")
        for (offset, record, errors) in fh.read_records(limit=None, offsets=True):
            self.offset = offset

            if record:
                if record.type != 'metadata':
                    continue
                ### precalculated data that is used multiple times
                self.headers, self.content = self.parse_headers_and_content(record)

                result = None
                if self.parseType == "hopinfo":
                    result = self.get_hopinfo(record)
                elif self.parseType == "outlinks":
                    result = self.get_outlinks(record)
                else:
                    sys.exit("Invalid parseType option: " + self.parseType)
                if result:
                    print result
            elif errors:
                sys.exit("Exiting with the following errors:\n" + str(errors))
            else:
                pass  # tail
        fh.close()
Example #6
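# Print a one-line index (filename, offset, type, subject URI, record id,
# content type, content length) for every record in the given WARC files.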
def main(argv):
    (options, input_files) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if len(input_files) < 1:
        parser.error("no imput warc file(s)")
        
    print '#WARC filename offset warc-type warc-subject-uri warc-record-id content-type content-length'
    for name in input_files:
        fh = ArchiveRecord.open_archive(name, gzip="auto")

        for (offset, record, errors) in fh.read_records(limit=None):
            if record:
                print name, offset, record.type, record.url, record.id, record.content_type, record.content_length
            elif errors:
                pass
                # ignore
            else:
                pass
                # no errors at tail

        fh.close()

    return 0
Example #7
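# Expand a single WARC file into a directory named after it, write an index
# log, and extract text from the HTML files found in the archive.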
def expandWarcFile(warcFile):
#     if (len(argv) < 1):
#         print >> sys.stderr, "usage: processWarcDir.py -d <directory> -i <collection_id> -e <event> -t <event_type>"
#         sys.exit()
#         
#     if (argv[0] == "-h" or  len(argv) < 4):
#         print >> sys.stderr, "usage: processWarcDir.py -d <directory> -i <collection_id> -e <event> -t <event_type>"
#         sys.exit()
    
    
    rootdir = os.path.dirname(warcFile)
    filename = os.path.basename(warcFile)
    filePath = warcFile
    if filename.endswith(".warc") or filename.endswith(".warc.gz"):  # or filename.endswith(".arc.gz")
        # processWarcFile(filePath, collection_id, event, event_type)
        splitext = filePath.split('.')
        output_dir = splitext[0] + "/"

        log_file = os.path.join(output_dir, filePath[filePath.rfind("/") + 1:] + '.index.txt')

        # output_file = output_dir + filePath.split("/")[1] + ".index.txt"
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

            # unpackWarcAndRetrieveHtml(filePath, collection_id, event, event_type)
            # output_dir = filePath.split(".")[0] + "/"
            default_name = 'crawlerdefault'
            wayback = "http://wayback.archive-it.org/"
            collisions = 0

            #log_file = os.path.join(output_dir, filePath[filePath.rfind("/")+1:] + '.index.txt')

            log_fileh = open(log_file, 'w+b')
            warcunpack_ia.log_headers(log_fileh)

            try:
                with closing(ArchiveRecord.open_archive(filename=filePath, gzip="auto")) as fh:
                    collisions += warcunpack_ia.unpack_records(filePath, fh, output_dir, default_name, log_fileh, wayback)

            except StandardError, e:
                print "exception in handling", filePath, e
                return
        else:
            print "Directory Already Exists"

            #print "Warc unpack finished"

        html_files = parseLogFileForHtml(log_file)
        #print "Log file parsed for html file paths"
        #print len(html_files)

        # for i in html_files:
            # extractTextAndIndexToSolr(i["file"], i["url"], i["wayback_url"], collection_id, event, event_type)
        tf, urls = extractText(html_files)
        #print "extracting Text finished"
        return tf, urls
Example #8
def main(argv):
    (options, input_files) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if len(input_files) < 1:
        parser.error("no pattern")

    pattern, input_files = input_files[0], input_files[1:]

    invert = options.invert
    pattern = re.compile(pattern)
    if not input_files:
        fh = ArchiveRecord.open_archive(file_handle=sys.stdin, gzip=None)
        filter_archive(fh, options, pattern, out)
    else:
        for name in input_files:
            fh = ArchiveRecord.open_archive(name, gzip="auto")
            filter_archive(fh, options, pattern, out)
            fh.close()

    return 0
Example #9
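# Dump every record of the given WARC files, or of stdin when no files are
# given.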
def main(argv):
    (options, input_files) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if len(input_files) < 1:
        dump_archive(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None), name="-", offsets=False)

    else:
        for name in input_files:
            fh = ArchiveRecord.open_archive(name, gzip="auto")
            dump_archive(fh, name)

            fh.close()

    return 0
Example #10
def main(argv):
    (options, input_files) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if len(input_files) < 1:
        dump_archive(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None),
                     name="-",
                     offsets=False)

    else:
        for name in input_files:
            fh = ArchiveRecord.open_archive(name, gzip="auto")
            dump_archive(fh, name)

            fh.close()

    return 0
Example #11
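# Dump the first record on stdin, or copy records from a WARC file into a
# zip archive.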
def main(argv):
    (options, args) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if len(args) < 1:
        # dump the first record on stdin
        with closing(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None)) as fh:
            dump_record(fh)
        
    else:
        filename = args[0]
        zipfilename = args[1]

        with ZipFile(zipfilename, "w") as outzip:
            with closing(ArchiveRecord.open_archive(filename=filename, gzip="auto")) as fh:
                dump_record(fh, outzip)


    return 0
Example #12
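    # Write one CDX index line per record: dispatch each record to a handler,
    # skip excluded or malformed entries, and update the stats counters.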
    def _make_cdx(self, stats):
        self.out_file.write(b' CDX ' + self.format + b'\n')  #print header

        fh = ArchiveRecord.open_archive(self.file, gzip="auto", mode="r")
        for (offset, record, errors) in fh.read_records(limit=None,
                                                        offsets=True):
            if not record:
                if errors:
                    raise ParseError(str(errors))
                continue  # tail

            stats['num_records_processed'] += 1
            handler = self.dispatcher.get_handler(record,
                                                  offset=offset,
                                                  cdx_writer=self)
            if not handler:
                continue

            ### arc files from the live web proxy can have a negative content length and a missing payload
            ### check the content_length from the arc header, not the computed payload size returned by record.content_length
            content_length_str = record.get_header(record.CONTENT_LENGTH)
            if content_length_str is not None and int(content_length_str) < 0:
                continue

            surt = handler.massaged_url
            if self.should_exclude(surt):
                stats['num_records_filtered'] += 1
                continue

            ### precalculated data that is used multiple times
            # self.headers, self.content = self.parse_headers_and_content(record)
            # self.mime_type             = self.get_mime_type(record, use_precalculated_value=False)

            values = [
                b'-' if v is None else v for v in self.fieldgetter(handler)
            ]
            self.out_file.write(b' '.join(values) + b'\n')
            #record.dump()
            stats['num_records_included'] += 1

        fh.close()
Example #13
def main(argv):
    (options, args) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if len(args) < 1:
        # dump the first record on stdin
        with closing(WarcRecord.open_archive(file_handle=sys.stdin,
                                             gzip=None)) as fh:
            dump_record(fh)

    else:
        filename = args[0]
        zipfilename = args[1]

        with ZipFile(zipfilename, "w") as outzip:
            with closing(
                    ArchiveRecord.open_archive(filename=filename,
                                               gzip="auto")) as fh:
                dump_record(fh, outzip)

    return 0
Example #14
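# Dump each input WARC file into a mandatory output directory (-o),
# creating the directory if needed.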
def main(argv):
    (options, input_files) = parser.parse_args(args=argv[1:])
    out = sys.stdout
    
    if not options.output_directory:
        parser.error("option -o is mandatory")
        
    if not os.path.isdir(options.output_directory):
        os.makedirs(options.output_directory)
    
    if len(input_files) < 1:
        parser.error("list of warc files is mandatory")
        
    else:
        for name in input_files:
            fh = ArchiveRecord.open_archive(name, gzip="auto")
            dump_archive(fh, name, options.output_directory)

            fh.close()

    return 0
Example #15
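# Dump a single record: the first record on stdin, or the record at an
# optional byte offset within the named WARC file.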
def main(argv):
    (options, args) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if len(args) < 1:
        # dump the first record on stdin
        with closing(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None)) as fh:
            dump_record(fh)
        
    else:
        # dump a record from the filename, with optional offset
        filename = args[0]
        if len(args) > 1:
            offset = int(args[1])
        else:
            offset = 0

        with closing(ArchiveRecord.open_archive(filename=filename, gzip="auto")) as fh:
            fh.seek(offset)
            dump_record(fh)


    return 0
Example #16
def main(argv):
    (options, args) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if len(args) < 1:
        # dump the first record on stdin
        with closing(WarcRecord.open_archive(file_handle=sys.stdin,
                                             gzip=None)) as fh:
            dump_record(fh)

    else:
        # dump a record from the filename, with optional offset
        filename = args[0]
        if len(args) > 1:
            offset = int(args[1])
        else:
            offset = 0

        with closing(ArchiveRecord.open_archive(filename=filename,
                                                gzip="auto")) as fh:
            fh.seek(offset)
            dump_record(fh)

    return 0
Example #17
    def _make_cdx(self, stats):
        self.out_file.write(b' CDX ' + self.format + b'\n') #print header

        fh = ArchiveRecord.open_archive(self.file, gzip="auto", mode="r")
        for (offset, record, errors) in fh.read_records(limit=None, offsets=True):
            if not record:
                if errors:
                    raise ParseError(str(errors))
                continue # tail

            stats['num_records_processed'] += 1
            handler = self.dispatcher.get_handler(record, offset=offset, cdx_writer=self)
            if not handler:
                continue

            ### arc files from the live web proxy can have a negative content length and a missing payload
            ### check the content_length from the arc header, not the computed payload size returned by record.content_length
            content_length_str = record.get_header(record.CONTENT_LENGTH)
            if content_length_str is not None and int(content_length_str) < 0:
                continue

            surt = handler.massaged_url
            if self.should_exclude(surt):
                stats['num_records_filtered'] += 1
                continue

            ### precalculated data that is used multiple times
            # self.headers, self.content = self.parse_headers_and_content(record)
            # self.mime_type             = self.get_mime_type(record, use_precalculated_value=False)

            values = [b'-' if v is None else v for v in self.fieldgetter(handler)]
            self.out_file.write(b' '.join(values) + b'\n')
            #record.dump()
            stats['num_records_included'] += 1

        fh.close()
Example #18
def main(argv):
    (options, input_files) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if len(input_files) < 1:
        parser.error("no imput warc file(s)")

    print '#WARC filename offset warc-type warc-subject-uri warc-record-id content-type content-length'
    for name in input_files:
        fh = ArchiveRecord.open_archive(name, gzip="auto")

        for (offset, record, errors) in fh.read_records(limit=None):
            if record:
                print name, offset, record.type, record.url, record.id, record.content_type, record.content_length
            elif errors:
                pass
                # ignore
            else:
                pass
                # no errors at tail

        fh.close()

    return 0
Example #19
def expandWarcFile(warcFile):
    #     if (len(argv) < 1):
    #         print >> sys.stderr, "usage: processWarcDir.py -d <directory> -i <collection_id> -e <event> -t <event_type>"
    #         sys.exit()
    #
    #     if (argv[0] == "-h" or  len(argv) < 4):
    #         print >> sys.stderr, "usage: processWarcDir.py -d <directory> -i <collection_id> -e <event> -t <event_type>"
    #         sys.exit()

    rootdir = os.path.dirname(warcFile)
    filename = os.path.basename(warcFile)
    filePath = warcFile
    if filename.endswith(".warc") or filename.endswith(
            ".warc.gz"):  # or filename.endswith(".arc.gz"):
        # processWarcFile(filePath, collection_id, event, event_type)
        splitext = filePath.split('.')
        output_dir = splitext[0] + "/"

        log_file = os.path.join(
            output_dir, filePath[filePath.rfind("/") + 1:] + '.index.txt')

        # output_file = output_dir + filePath.split("/")[1] + ".index.txt"
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

            # unpackWarcAndRetrieveHtml(filePath, collection_id, event, event_type)
            # output_dir = filePath.split(".")[0] + "/"
            default_name = 'crawlerdefault'
            wayback = "http://wayback.archive-it.org/"
            collisions = 0

            #log_file = os.path.join(output_dir, filePath[filePath.rfind("/")+1:] + '.index.txt')

            log_fileh = open(log_file, 'w+b')
            warcunpack_ia.log_headers(log_fileh)

            try:
                with closing(
                        ArchiveRecord.open_archive(filename=filePath,
                                                   gzip="auto")) as fh:
                    collisions += warcunpack_ia.unpack_records(
                        filePath, fh, output_dir, default_name, log_fileh,
                        wayback)

            except StandardError, e:
                print "exception in handling", filePath, e
                return
        else:
            print "Directory Already Exists"

            #print "Warc unpack finished"

        html_files = parseLogFileForHtml(log_file)
        #print "Log file parsed for html files pathes"
        #print len(html_files)

        # for i in html_files:
        # extractTextAndIndexToSolr(i["file"], i["url"], i["wayback_url"], collection_id, event, event_type)
        tf, urls = extractText(html_files)
        #print "extracting Text finished"
        return tf, urls
Example #20
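# Walk a collection directory, unpack every WARC file found into a sibling
# directory, then extract text from the resulting HTML files and index it
# into Solr.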
def main(argv):
    #     if (len(argv) < 1):
    #         print >> sys.stderr, "usage: processWarcDir.py -d <directory> -i <collection_id> -e <event> -t <event_type>"
    #         sys.exit()
    #
    #     if (argv[0] == "-h" or  len(argv) < 4):
    #         print >> sys.stderr, "usage: processWarcDir.py -d <directory> -i <collection_id> -e <event> -t <event_type>"
    #         sys.exit()
    #Done
    #argv = ["","/home/mohamed/IACollections/rem","","3647","","Texas Fertilizer Plant Explosion","","Accident"]
    #argv = ["","/home/mohamed/IACollections/3437","","3437","","Connecticut School Shooting","","Shooting"]
    #argv = ["","/home/mohamed/IACollections/2305","","2305","","Tucson Shooting","","Shooting"]
    #argv = ["","/home/mohamed/IACollections/2823","","2823","","Russia Plane Crash","","Plane_Crash"]
    #argv = ["","/home/mohamed/IACollections/2379","","2379","","Youngstown Shooting","","Shooting"]
    #argv = ["","/home/mohamed/IACollections/2772","","2772","","Norway Shooting","","Shooting"]
    #argv = ["","/home/mohamed/IACollections/694","","694","","April 16 Shooting","","Shooting"]
    #argv = ["","/home/mohamed/IACollections/2892","","2892","","Somalia_Bomb_Blast","","Bombing"]
    #argv = ["","/home/mohamed/IACollections/2838","","2838","","Nevada_AirRace_Crash","","Plane_Crash"]
    #argv = ["","/home/mohamed/IACollections/2822","","2822","","Texas_Wild_Fire","","Fire"]
    #argv = ["","/home/mohamed/IACollections/2882","","2882","","Encephalitis","","Disease_Outbreak"]
    #argv = ["","/home/mohamed/IACollections/2842","","2842","","China_Flood","","Flood"]
    #argv = ["","/home/mohamed/IACollections/2836","","2836","","Pakistan_Flood","","Flood"]
    #argv = ["","/home/mohamed/IACollections/3535","","3535","","Brazil_NightClub_Fire","","Fire"]
    #argv = ["","/home/mohamed/IACollections/2316","","2316","","Haiti_Earthquake_Anniversary","","Earthquake"]
    #argv = ["","/home/mohamed/IACollections/2406","","2406","","New_Zealand_Earthquake","","Earthquake"]
    #argv = ["","/home/mohamed/IACollections/2821","","2821","","Virginia_Earthquake","","Earthquake"]
    #Not Yet

    argv = [
        "", "/home/mohamed/IACollections/2903", "", "2903", "",
        "Turkey_Earthquake", "", "Earthquake"
    ]

    rootdir = argv[1]
    collection_id = argv[3]
    event = argv[5]
    event_type = argv[7]

    for root, subFolders, files in os.walk(rootdir):
        for filename in files:
            filePath = os.path.join(root, filename)
            if filename.endswith(".warc") or filename.endswith(
                    ".warc.gz"):  # or filename.endswith(".arc.gz"):
                # processWarcFile(filePath, collection_id, event, event_type)
                splitext = filePath.split('.')
                output_dir = splitext[0] + "/"

                log_file = os.path.join(
                    output_dir,
                    filePath[filePath.rfind("/") + 1:] + '.index.txt')

                # output_file = output_dir + filePath.split("/")[1] + ".index.txt"
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)

                    # unpackWarcAndRetrieveHtml(filePath, collection_id, event, event_type)
                    # output_dir = filePath.split(".")[0] + "/"
                    default_name = 'crawlerdefault'
                    wayback = "http://wayback.archive-it.org/"
                    collisions = 0

                    #log_file = os.path.join(output_dir, filePath[filePath.rfind("/")+1:] + '.index.txt')

                    log_fileh = open(log_file, 'w+b')
                    warcunpack_ia.log_headers(log_fileh)

                    try:
                        with closing(
                                ArchiveRecord.open_archive(filename=filePath,
                                                           gzip="auto")) as fh:
                            collisions += warcunpack_ia.unpack_records(
                                filePath, fh, output_dir, default_name,
                                log_fileh, wayback)

                    except StandardError, e:
                        print >> sys.stderr, "exception in handling", filePath, e
                        return

                    print "Warc unpack finished"

                html_files = parseLogFileForHtml(log_file)
                print "Log file parsed for html files pathes"

                # for i in html_files:
                # extractTextAndIndexToSolr(i["file"], i["url"], i["wayback_url"], collection_id, event, event_type)
                extractTextAndIndexToSolr(html_files, collection_id, event,
                                          event_type)
                print "Storing and Indexing finished"
Example #21
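    # Full CDX writer: stream records, filter by record and content type,
    # format each included record according to self.format, and optionally
    # write JSON stats.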
    def make_cdx(self):
        if isinstance(self.out_file, basestring):
            self.out_file = open(self.out_file, 'wb')
        self.out_file.write(' CDX ' + self.format + '\n') #print header

        if not self.all_records:
            #filter cdx lines if --all-records isn't specified
            allowed_record_types     = set(['response', 'revisit'])
            disallowed_content_types = set(['text/dns'])

        stats = {
            'num_records_processed': 0,
            'num_records_included':  0,
            'num_records_filtered':  0,
        }

        fh = ArchiveRecord.open_archive(self.file, gzip="auto", mode="r")
        for (offset, record, errors) in fh.read_records(limit=None, offsets=True):
            self.offset = offset

            if record:
                stats['num_records_processed'] += 1
                if self.screenshot_mode:
                    if record.type != 'metadata':
                        continue
                elif not self.all_records and (record.type not in allowed_record_types or record.content_type in disallowed_content_types):
                    continue

                ### arc files from the live web proxy can have a negative content length and a missing payload
                ### check the content_length from the arc header, not the computed payload size returned by record.content_length
                content_length_str = record.get_header(record.CONTENT_LENGTH)
                if content_length_str is not None and int(content_length_str) < 0:
                    continue

                self.surt = self.get_massaged_url(record, use_precalculated_value=False)
                if self.should_exclude(self.surt):
                    stats['num_records_filtered'] += 1
                    continue

                ### precalculated data that is used multiple times
                self.headers, self.content = self.parse_headers_and_content(record)
                self.mime_type             = self.get_mime_type(record, use_precalculated_value=False)
                self.response_code         = self.get_response_code(record, use_precalculated_value=False)
                self.meta_tags             = self.parse_meta_tags(record)

                s = u''
                for field in self.format.split():
                    if field not in self.field_map:
                        raise ParseError('Unknown field: ' + field)

                    endpoint = self.field_map[field].replace(' ', '_')
                    response = getattr(self, 'get_' + endpoint)(record)
                    #print self.offset
                    #print record.compressed_record_size
                    #print record.content_length
                    #print record.headers
                    #print len(self.content)
                    #print repr(record.content[1])
                    #print endpoint
                    #print repr(response)
                    s += response + ' '
                self.out_file.write(s.rstrip().encode('utf-8') + '\n')
                #record.dump()
                stats['num_records_included'] += 1
            elif errors:
                raise ParseError(str(errors))
            else:
                pass # tail

        fh.close()

        if self.stats_file is not None:
            f = open(self.stats_file, 'w')
            json.dump(stats, f, indent=4)
            f.close()
Example #22
def main(argv):
#     if (len(argv) < 1):
#         print >> sys.stderr, "usage: processWarcDir.py -d <directory> -i <collection_id> -e <event> -t <event_type>"
#         sys.exit()
#         
#     if (argv[0] == "-h" or  len(argv) < 4):
#         print >> sys.stderr, "usage: processWarcDir.py -d <directory> -i <collection_id> -e <event> -t <event_type>"
#         sys.exit()
    #Done
    #argv = ["","/home/mohamed/IACollections/rem","","3647","","Texas Fertilizer Plant Explosion","","Accident"]
    #argv = ["","/home/mohamed/IACollections/3437","","3437","","Connecticut School Shooting","","Shooting"]
    #argv = ["","/home/mohamed/IACollections/2305","","2305","","Tucson Shooting","","Shooting"]
    #argv = ["","/home/mohamed/IACollections/2823","","2823","","Russia Plane Crash","","Plane_Crash"]
    #argv = ["","/home/mohamed/IACollections/2379","","2379","","Youngstown Shooting","","Shooting"]
    #argv = ["","/home/mohamed/IACollections/2772","","2772","","Norway Shooting","","Shooting"]
    #argv = ["","/home/mohamed/IACollections/694","","694","","April 16 Shooting","","Shooting"]
    #argv = ["","/home/mohamed/IACollections/2892","","2892","","Somalia_Bomb_Blast","","Bombing"]
    #argv = ["","/home/mohamed/IACollections/2838","","2838","","Nevada_AirRace_Crash","","Plane_Crash"]
    #argv = ["","/home/mohamed/IACollections/2822","","2822","","Texas_Wild_Fire","","Fire"]
    #argv = ["","/home/mohamed/IACollections/2882","","2882","","Encephalitis","","Disease_Outbreak"]
    #argv = ["","/home/mohamed/IACollections/2842","","2842","","China_Flood","","Flood"]
    #argv = ["","/home/mohamed/IACollections/2836","","2836","","Pakistan_Flood","","Flood"]
    #argv = ["","/home/mohamed/IACollections/3535","","3535","","Brazil_NightClub_Fire","","Fire"]
    #argv = ["","/home/mohamed/IACollections/2316","","2316","","Haiti_Earthquake_Anniversary","","Earthquake"]
    #argv = ["","/home/mohamed/IACollections/2406","","2406","","New_Zealand_Earthquake","","Earthquake"]
    #argv = ["","/home/mohamed/IACollections/2821","","2821","","Virginia_Earthquake","","Earthquake"]
    #Not Yet

    argv = ["","/home/mohamed/IACollections/2903","","2903","","Turkey_Earthquake","","Earthquake"]

    rootdir = argv[1]
    collection_id = argv[3]
    event = argv[5]
    event_type = argv[7]
    
    for root, subFolders, files in os.walk(rootdir):
        for filename in files:
            filePath = os.path.join(root, filename)
            if filename.endswith(".warc") or filename.endswith(".warc.gz"):# or filename.endswith(".arc.gz"):
                # processWarcFile(filePath, collection_id, event, event_type)
                splitext = filePath.split('.')
                output_dir = splitext[0] + "/"
                
                log_file = os.path.join(output_dir, filePath[filePath.rfind("/")+1:] + '.index.txt')
                
                # output_file = output_dir + filePath.split("/")[1] + ".index.txt"
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
        
                    # unpackWarcAndRetrieveHtml(filePath, collection_id, event, event_type)
                    # output_dir = filePath.split(".")[0] + "/"
                    default_name = 'crawlerdefault'
                    wayback = "http://wayback.archive-it.org/"
                    collisions = 0
                        
                    #log_file = os.path.join(output_dir, filePath[filePath.rfind("/")+1:] + '.index.txt')
                    
                    log_fileh = open(log_file, 'w+b')
                    warcunpack_ia.log_headers(log_fileh)
                
                    try:
                        with closing(ArchiveRecord.open_archive(filename=filePath, gzip="auto")) as fh:
                            collisions += warcunpack_ia.unpack_records(filePath, fh, output_dir, default_name, log_fileh, wayback)
                
                    except StandardError, e:
                        print >> sys.stderr, "exception in handling", filePath, e
                        return
                
                    print "Warc unpack finished"
                
                html_files = parseLogFileForHtml(log_file)
                print "Log file parsed for html files pathes"
                
                
                # for i in html_files:
                    # extractTextAndIndexToSolr(i["file"], i["url"], i["wayback_url"], collection_id, event, event_type)
                extractTextAndIndexToSolr(html_files, collection_id, event, event_type)
                print "Storing and Indexing finished"