def read_and_clean_files(clueweb_file, ann_file, data_dir, ann_dir):
    """
    Read file from data_dir and ann_dir, replace entity mentions and clean records in that file
    :param clueweb_file:
    :param ann_file:
    :param data_dir: Warc files directory
    :param ann_dir: Annotations directory
    :return: {'record_id': record_id, 'replaced_record': cleaned_replaced_record, 'cleaned_record': cleaned_record}
    """
    annotation_input = fileinput.FileInput(os.path.join(ann_dir, ann_file),
                                           openhook=fileinput.hook_compressed)
    annotation_list = []
    for line in annotation_input:
        annotation_list.append(Annotation.parse_annotation(line))
    warc_path = os.path.join(data_dir, clueweb_file)
    warc_file = warc.open(warc_path)
    print "Replacing entity mentions for ", clueweb_file, ":", ann_file, "..."
    start = time.time()
    warc_entry = WarcEntry(warc_path, warc_file, annotation_list)
    cleaned_records = warc_entry.replace_entity_mentions()
    end = time.time()
    print "Time used: ", end - start
    warc_file.close()
    return cleaned_records
def handle_warc_file(warc_file_name):
    global PROCESS_STDOUT_HANDLE
    if warc_file_name.find('.warc.gz') < 0:
        return
    if PROCESS_STDOUT_FILENAME is None:
        initialize_output_files(0)
    w = warc.open(warc_file_name)
    try:
        for record in w:
            try:
                record_url_parsed = urlparse.urlparse(record['WARC-Target-URI'])
                host = record_url_parsed.hostname
                queries_string = record_url_parsed.query
                queries = [x.split('=')[0] for x in queries_string.split('&')]  # grabs a list of keys
                for query in queries:
                    print host, query  # one line per host and query combination
            except:
                sys.stderr.write('Awkward record in ' + warc_file_name + '\n')
                continue
    except IOError:
        sys.stderr.write('Awkward record found in ' + warc_file_name + '\n')
def validate_file(warc_path):
    """Validate a CYOC.net grab warc file"""
    wf = warc.open(warc_path)
    for record in wf:
        if record.type == 'response':
            validate_record(record)
    return
def run(string):
    myindex = index.open_dir("index")
    writer = myindex.writer(procs=3, multisegment=True, limitmb=512)
    f = None
    try:
        f = warc.open(string)
    except:
        print "Can't open this file"
        sys.exit()
    i = 0
    count = 1
    datalist = []
    print "start read record!"
    for record in f:
        i = i + 1
        if i >= 2:
            # parser = MyHTMLParser()
            try:
                # data = striphtml(unicode(record.payload, errors='ignore'))
                # datalist.append(data)
                # if data == None:
                #     print i
                parser = MyHTMLParser()
                parser.feed(unicode(record.payload, errors='ignore'))
                data = parser.getData().decode('utf8')
                # datalist.append(data2)
                parser.kill()
                if (i % 15000) == 0:
                    print "write the data of %d" % (i)
                    # for x in datalist:
                    #     writer.add_document(docId=count, content=x)
                    #     count += 1
                    print "commit now"
                    start = time.time()
                    writer.commit()
                    stop = time.time()
                    print "commit over ", (stop - start)
                    writer = myindex.writer(procs=3, multisegment=True, limitmb=512)
                    datalist = []
                    gc.collect()
            except Exception as e:
                print "error in the data of %d" % (i)
                print e.message, e.args
                print "------------------------"
                # print data
                # parser.kill()
            writer.add_document(docId=i - 1, content=data)
            # for x in datalist:
            #     writer.add_document(docId=count, content=x)
            #     count += 1
    print "final commit now"
    start = time.time()
    writer.commit()
    stop = time.time()
    print "final commit over", (stop - start)
def create_warc_from_corpus(documents, filename=None):
    """ Used mainly in tests to generate small .warc files """
    if filename is None:
        fd, filename = tempfile.mkstemp(suffix=".warc")
        os.close(fd)
    f = warc.open(filename, "w")
    for doc in documents:
        headers = "Connection: close\r\nContent-Type: text/html"
        if "headers" in doc:
            headers = "\r\n".join(["%s: %s" % (k, v) for k, v in doc["headers"].iteritems()])
        payload = "HTTP/1.1 200 OK\r\n" + headers + "\r\n\r\n" + doc["content"]
        record = warc.WARCRecord(payload=payload, headers={
            "Content-Type": "application/http; msgtype=response",
            "WARC-Type": "response",
            "WARC-Target-URI": doc["url"]
        })
        f.write_record(record)
    f.close()
    return filename
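# A minimal usage sketch for create_warc_from_corpus above (Python 2, like the
# helper itself); the document dict and its fields are illustrative assumptions.
docs = [{"url": "http://example.com/", "content": "<html><body>hello</body></html>"}]
path = create_warc_from_corpus(docs)
f = warc.open(path)
for record in f:
    print record["WARC-Target-URI"]
f.close()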
def process_facc1_with_filename(facc1file, cluewebfile):
    facc1_obj = open(facc1file, 'rb')
    clueweb_obj = warc.open(cluewebfile, 'rb')
    process_facc1_with_fileobj(facc1_obj, clueweb_obj)
    facc1_obj.close()
    clueweb_obj.close()
def _get_warc_file(self):
    """Creates a new Warc file"""
    assert self.warc_fp is None, "Current Warc file must be None"
    self.file_n += 1
    fname = "{}.{}.warc.gz".format(self.fname_prefix, self.file_n)
    self.warc_fname = os.path.join(self.outdir, fname)
    self.warc_fp = warc.open(self.warc_fname, "w")
def parser(filename):
    try:
        warcfile = warc.open(filename)
    except Exception, e:
        os.system("cls")
        print "Can't find the " + filename
        os.system("pause")
        raise e
def _get_warc_file(self):
    '''Creates a new Warc file'''
    assert self.warc_fp is None, 'Current Warc file must be None'
    self.file_n += 1
    fname = '%s.%s.warc.gz' % (self.fname_prefix, self.file_n)
    self.warc_fname = os.path.join(self.outdir, fname)
    self.warc_fp = warc.open(self.warc_fname, 'w')
def warc_records(string, path):
    """Iterates over warc records in path."""
    for filename in os.listdir(path):
        if re.search(string, filename) and ".warc" in filename:
            print("parsing", filename)
            with warc.open(path + filename) as warc_file:
                for record in warc_file:
                    yield record
def FilterOneFile(InName, OutName, hBlack):
    In = warc.open(InName)
    Out = warc.open(OutName, 'w')
    cnt = 0
    FilterCnt = 0
    while True:
        try:
            record = In.read_record()
        except (AssertionError, EOFError) as e:
            break
        cnt += 1
        if 'warc-trec-id' in record:
            if record['warc-trec-id'] in hBlack:
                FilterCnt += 1
                continue
        Out.write_record(record)
    print '[%s] [%d/%d] filtered' % (InName, FilterCnt, cnt)
    return True
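# A hypothetical driver for FilterOneFile above; the blacklist file name and its
# one-TREC-ID-per-line format are assumptions, not part of the original snippet.
hBlack = set(line.strip() for line in open('blacklist_trec_ids.txt'))
FilterOneFile('input.warc.gz', 'filtered.warc.gz', hBlack)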
def handle_warc_file(warc_file):
    f = warc.open(warc_file)
    for record in f:
        if not core.is_response_record(record):
            continue
        soup = BeautifulSoup(record.payload.read())
        if soup.text.find('IP.Board') >= 0:
            yield record.url
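# core.is_response_record is not shown above; a plausible minimal stand-in,
# assuming the record.type attribute exposed by the Python 2 warc library.
def is_response_record(record):
    # WARC-Type is "response" for HTTP response records.
    return record.type == 'response'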
def open_warc(self):
    # Reset period
    self.period_start = time.gmtime()
    self.period_start_time = time.time()
    # Open the warc
    self.warc_filepath = utilities.generate_warc_filepath(self.data_dir, self.collection,
                                                          warc_type=self.stream_name)
    utilities.create_warc_dir(self.warc_filepath)
    log.debug("Opening %s", self.warc_filepath)
    self.warc = warc.open(self.warc_filepath, "wb")
def handle_warc_file(warc_file, f):
    '''
    Perform a function over every record in a warc file.

    Args:
    - warc_file : path/to/warc.gz file
    - f : function to run over every record.
    '''
    warc_handle = warc.open(warc_file)
    for record in warc_handle:
        yield f(record)
def get_response_warcs(warc_file):
    '''
    Returns a sequence of response records only in a warc_file

    Args:
    - warc_file : path/to/warc.gz file
    '''
    warc_handle = warc.open(warc_file)
    for record in warc_handle:
        if record.type == 'response':
            yield record
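# A usage sketch tying the two generator helpers above together; the file path
# and header key are assumptions for illustration only.
n_responses = sum(1 for _ in get_response_warcs('example.warc.gz'))
urls = list(handle_warc_file('example.warc.gz',
                             lambda record: record.header.get('warc-target-uri')))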
def main():
    targetId = 22466
    # filename = "data/ClueWeb09_English_Sample.warc"
    filename = "data/10.warc.gz"
    warcfile = warc.open(filename)
    docId = 0
    for doc in warcfile:
        if docId == targetId:
            print unicode(doc.payload, errors="ignore")
        elif docId > targetId:
            break
        docId += 1
def __iter__(self):
    for fpath in self.fpaths:
        f = warc.open(fpath)
        for record in f:
            if record.type == 'response':
                try:
                    header = record.header
                    content = record.payload.read()
                    yield (header, content)
                except Exception as e:
                    print(e)
        f.close()
def find_file(filename):
    # E.g. "clueweb12-0002wb-99-19011"
    root = filename.split("-")[1]
    folder = root[0:2]
    warcf = filename.split("-")[2]
    warc_path = ("/clueweb12b/DiskB/ClueWeb12_" + folder + "/" + root + "/" +
                 root + "-" + warcf + ".warc.gz")
    w = warc.open(warc_path)
    for reg in w:
        if "warc-trec-id" not in reg.header.keys():
            continue
        if filename == reg.header["warc-trec-id"]:
            return reg.payload.read()
    return filename + " NOT FOUND"
def main():
    targeturl = argv[1]
    filename = "%s-%s.warc" % (urlparse(targeturl).netloc,
                               datetime.utcnow().strftime("%Y%m%d-%H%M"))
    print "Starting snapshot of %s, writing to %s" % (targeturl, filename)
    wf = warc.open(filename, "w")
    warcinfo_record = mkwarcinfo(filename)
    if DEBUG:
        print "Writing warcinfo record"
    wf.write_record(warcinfo_record)
    record = download(targeturl)
    if len(REQUESTS):
        request_record = REQUESTS.pop(0)
        if DEBUG:
            print "Writing request record %s" % request_record['WARC-Record-ID']
        wf.write_record(request_record)
        if DEBUG:
            print "Writing response record %s" % record['WARC-Record-ID']
        wf.write_record(record)
    else:
        if DEBUG:
            print "Writing response record"
        wf.write_record(record)
    # If the parser could parse the first resource, continue to download found
    # resources. Doesn't parse again, currently. Only grabbing images, css etc.
    if DEBUG:
        print "Downloading linked content"
    for target in TARGETS:
        record = download(target)
        if len(REQUESTS):
            request_record = REQUESTS.pop(0)
            if DEBUG:
                print "Writing request record %s" % request_record['WARC-Record-ID']
            wf.write_record(request_record)
            if DEBUG:
                print "Writing response record %s" % record['WARC-Record-ID']
            wf.write_record(record)
        else:
            record = download(target)
            if DEBUG:
                print "Writing response record."
            wf.write_record(record)
    if DEBUG:
        print "TARGETS ", TARGETS
    wf.close()
    print "Done."
def write_csv(in_name=None, out_name="warc_header.csv"):
    with open(out_name, 'wb') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=' ', quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        with warc.open(in_name, 'rU') as f:
            for record in f:
                url = record['WARC-Target-URI']
                rec_id = record['WARC-Record-ID']
                warc_date = record['WARC-Date']
                ip = record['WARC-IP-Address']
                spamwriter.writerow([url, rec_id, ip, warc_date])
        # csv writers have no close(); the with-block closes the underlying file.
    return True
def __init__(self, filepath):
    """
    :param filepath: The filepath of the WARC file.
    """
    self.filepath = filepath
    log.info("Writing to %s", self.filepath)
    # Create the directory
    filepath_parent = os.path.dirname(self.filepath)
    if not os.path.exists(filepath_parent):
        log.debug("Creating %s directory.", filepath_parent)
        os.makedirs(filepath_parent)
    # Open warc
    self._warc_file = ia_warc.open(self.filepath, "w")
def article_records():
    for fn in args.warcs:
        n_records = n_errors = 0
        print('Processing {}'.format(fn))
        warc_file = warc.open(fn)
        for record in warc_file:
            n_records += 1
            try:
                yield process_article(record)
            except AssertionError as e:
                n_errors += 1
                logging.error('{}\t{}'.format(record.url, e))
        warc_file.close()
        print('Records processed: {} ({} errors => {} inserted)'.format(
            n_records, n_errors, n_records - n_errors))
def _get_warc_file(self):
    '''Creates a new Warc file'''
    # assert self.warc_fp is None, 'Current Warc file must be None'
    if not self.hostname:
        return
    file_n = self.warc_file_n_slots.get(self.hostname)
    if not file_n:
        file_n = 0
    file_n += 1
    self.warc_file_n_slots[self.hostname] = file_n
    fname = '%s_%s.warc.gz' % (self.hostname, file_n)
    warc_fname = os.path.join(self.warc_dir, fname)
    assert os.path.exists(warc_fname) is not True
    warc_fp = warc.open(warc_fname, 'w')
    self.warc_fp_slots[self.hostname] = warc_fp
    return warc_fp
def FetchOneDirURL(InDir, OutName):
    lFName = WalkDir(InDir)
    out = open(OutName, 'w')
    for InName in lFName:
        In = warc.open(InName)
        while True:
            try:
                record = In.read_record()
            except (AssertionError, EOFError) as e:
                break
            if ('warc-trec-id' in record) and ('warc-target-uri' in record):
                print >> out, record['warc-target-uri'] + '\t' + record['warc-trec-id']
        print '[%s] finished' % (InName)
    out.close()
    print 'dir [%s] finished' % (InDir)
    return True
def list_records(filepath, services=()):
    print "File %s" % filepath
    f = warc.open(filepath)
    try:
        for record in f:
            if record.type == 'response':
                resp_record = to_response_record(record)
                if (not services
                        or (isinstance(resp_record, ApiResponseRecord)
                            and resp_record.service in services)
                        or (not isinstance(resp_record, ApiResponseRecord)
                            and "other" in services)):
                    print "Record %s" % resp_record.record_id
                    print "Url: %s" % resp_record.record_url
                    print "Date: %s" % resp_record.date
                    if isinstance(resp_record, ApiResponseRecord):
                        print "Service: %s" % resp_record.service
                        print "API method: %s (%s)" % (resp_record.api_method,
                                                       resp_record.api_args)
    finally:
        f.close()
def getDocContent(trecid):
    '''Work-in-progress function'''
    princess_dir = '/osirim/sig/PROJET/PRINCESS'
    corpus_dir = '/osirim/sig/CORPUS/CLUEWEB12/ClueWeb12-Full'
    clue_web_dir = trecid[0:9] + '_' + trecid[10:12]
    # print clue_web_dir
    clue_web_dir = clue_web_dir.replace("clueweb", "ClueWeb")
    warc_dir = trecid[10:16]
    # warc_file = warc_dir + '-' + trecid[19:21] + 'warc.gz'
    warc_file = (corpus_dir + '/' + clue_web_dir + '/' + warc_dir + "/" +
                 warc_dir + '-' + trecid[17:19] + '.warc.gz')
    # docnum = trecid[20:]
    # print warc_file
    file = warc.open(warc_file)
    # Loop through the warc to get the doc content
    for record in file:
        # print record.header
        if "WARC-Trec-ID" in record.header:
            # if record['WARC-TREC-ID'] == docNum:
            if record['WARC-Trec-ID'] == trecid:
                # print record.payload.read()
                html_doc = record.payload.read()
                # soup = BeautifulSoup(html_doc, 'html.parser')  # strips the tags well but crashes on </br />
                soup = BeautifulSoup(html_doc, 'html5lib')  # may keep some div, a and img tags
                for script in soup(["script", "style", "comment"]):
                    script.extract()
                text = soup.get_text()
                # Break into lines and remove leading and trailing space on each
                lines = (line.strip() for line in text.splitlines())
                # Break multi-headlines into a line each
                chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
                # Drop blank lines
                text_clean = '\n'.join(chunk for chunk in chunks if chunk)
                # txt_unicode = unicode(txt, 'utf-8')
                # txt_unicode = u"" + txt
                # txt_unicode = txt.encode('utf8', 'replace')
                # print txt
                # return txt.encode('utf-8', 'ignore')
                text_clean = text_clean.replace(u"</br />", u"</br>")
                return text_clean
def run(string):
    f = None
    try:
        f = warc.open(string)
    except:
        print "Can't open this file"
        sys.exit()
    i = 0
    error = 0
    print "start read record!"
    for record in f:
        i = i + 1
        if i > 2:
            data1 = striphtml(unicode(record.payload, errors='ignore'))
            if data1 is None:
                error = error + 1
            if (i % 1000) == 0:
                print error
                print i
    print error
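# striphtml is not defined in the snippet above; a common regex-based sketch of
# such a helper (an assumption, not the original implementation).
import re

def striphtml(data):
    # Drop anything that looks like an HTML/XML tag.
    return re.sub(r'<[^>]*>', ' ', data)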
def get_warc(url):
    global targeturl
    targeturl = url
    temp = tempfile.NamedTemporaryFile(mode='rb', suffix='.warc')
    filename = "%s-%s.warc" % (urlparse(targeturl).netloc,
                               datetime.utcnow().strftime("%Y%m%d-%H%M"))
    wf = warc.open(temp.name, "w")
    warcinfo_record = mkwarcinfo(filename)
    wf.write_record(warcinfo_record)
    record = download(targeturl)
    if len(REQUESTS):
        request_record = REQUESTS.pop(0)
        wf.write_record(request_record)
        wf.write_record(record)
    else:
        wf.write_record(record)
    # If the parser could parse the first resource, continue to download found
    # resources. Doesn't parse again, currently. Only grabbing images, css etc.
    for target in TARGETS:
        record = download(target)
        if len(REQUESTS):
            request_record = REQUESTS.pop(0)
            wf.write_record(request_record)
            wf.write_record(record)
        else:
            record = download(target)
            wf.write_record(record)
    wf.close()
    binary = open(temp.name, 'rb').read()
    temp.close()
    return binary
def read_simplyhired_data(filename):
    f = warc.open(filename)
    for record in f:
        # Ignore DNS records
        if record['content-type'] == 'text/dns':
            continue
        http_headers, content = parse_warc_payload(record.payload.read())
        soup = BeautifulSoup(content)
        if soup.select('div.detail .job_info'):
            # Get the job metadata and yield to iterator
            job_company = soup.select('div.detail .company')[0].text.strip().replace("Company: ", "")
            job_title = soup.select('div.detail .title')[0].text.strip()
            job_location = soup.select('div.detail .location')[0].text.strip().replace("Location: ", "")
            job_description = soup.select('div.detail .description_full')[0].get_text("\n", strip=True)
            # job_is_telecommute = soup.find_all("dt", text="Telecommute:")[0].find_next_sibling().text.strip()
            yield {
                "source": record.url,
                "company": job_company,
                "title": job_title,
                "location": job_location,
                "description": job_description
            }
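# parse_warc_payload is not defined in the snippet above; a minimal sketch of
# what such a helper might look like, assuming the payload is a raw HTTP
# response whose headers end at the first blank line.
def parse_warc_payload(raw):
    # Split the raw HTTP response into header block and body.
    headers, _, body = raw.partition("\r\n\r\n")
    return headers, body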
def open_warc(self):
    """ Opens a new WARC file """
    self.warc_file_path = os.path.join(
        config.warcs_path,
        "apicrawler.%s.warc.gz" % (datetime.datetime.now().strftime(config.datetime_format)))
    logger.info("Writing new WARC file: %s" % self.warc_file_path)
    self.warc_file = warc.open(self.warc_file_path, "w")
    #
    # Write WARCInfo record
    #
    warc_header = warc.WARCHeader({
        "WARC-Type": "warcinfo",
        "Content-Type": "application/warc-fields",
        "WARC-Filename": os.path.basename(self.warc_file_path)
    }, defaults=True)
    warc_payload = 'software: apicrawler\nhostname: ia200127'
    warc_record = warc.WARCRecord(warc_header, warc_payload)
    self.warc_file.write_record(warc_record)
    self.warcinfo_id = warc_header['WARC-RECORD-ID']
    logger.info("New WARC id: %s" % self.warcinfo_id)
from assessment.models import *
import mongoengine
import pymongo
import warc

# mongoengine.connect(db=ntcir)
con = pymongo.Connection('localhost', 27017)
ntcir = con.ntcir
topic = ntcir.ntcir
table = ntcir.table
table.insert({'table_id': '1', 'table': 'dog'})
table.insert({'table_id': '2', 'table': 'cat'})
table.insert({'table_id': '3', 'table': 'mouse'})

f = warc.open('/Users/Fan/Downloads/0000tw-00.warc.gz')
i = 0
for record in f:
    h = record.header
    test = record.payload.read()
    WARC_Trec_ID = h.get("WARC-Trec-ID")
    i += 1
    if i > 1:
        filepath = ('/Users/Fan/Downloads/NTCIRMDB2/assessment/templates/' +
                    str(WARC_Trec_ID) + '.html')
        with open(filepath, 'w') as d:
            d.write(test)

topic.insert({
    'topic_id': '1',
    'topic': 'dog',
with open("warc.paths", "r") as paths: # count = 0 # for path in paths: # path = pre + path # count += 1 # print(path) # print(count) # only process the first file for practice path = paths.readline() path = pre + path print(path) with warc.open( "CC-MAIN-20160205193905-00000-ip-10-236-182-209.ec2.internal.warc.gz", "r") as records: recordnum = 0 with open("warc_output.txt", "w") as fout: for record in records: # WARC records have three different types: # ["application/warc-fields", "application/http; msgtype=request", "application/http; msgtype=response"] # We're only interested in the HTTP responses if record.header[ 'content-type'] != 'application/http; msgtype=response': continue if recordnum < 1000000: try: content = record.payload.read().decode("utf-8")
# To run: python just_text.py > text
###
from glob import glob
# from nltk.corpus import stopwords
import warc

# List any of the WARC files found in the data folder
warc_files = glob(r"C:\DIC\Labs\Lab2\CC-MAIN-20190121172846-20190121194846-00444.warc.gz")

# Process each of the WARC files we found
files_processed = 0
for fn in warc_files:
    f = warc.open(fn)
    for record in f:
        url = record.header.get('warc-target-uri', None)
        if not url:
            continue
        text = record.payload.read()
        # print(url)
        print(text)


def construct_dictionary():
    pairs = []
    for i in range(0, 3):
        # + str(i)
        with open("reducer_output_cc" + "//part-r-0000" + str(i), "r", encoding="utf-8") as f:
count = 0
# filename = '../dataset/' + domain + '.com' + str(i) + '.warc.gz'
filename = '../samples/' + domain + '_sample.warc.gz'
print 'Load ' + filename
try:
    with gzip.open(filename, 'rb') as gzf:
        # Load file
        contents = [warc.WARCRecord(payload=record.payload.read(), headers=record.header)
                    for record in warc.WARCFile(fileobj=gzf)]
        l = len(contents)
except:
    continue

# Select records randomly
print 'select'
f_sample = warc.open(filename_sample, 'a')
while count < per_warc:
    rand = random.randint(0, l - 1)
    sys.stdout.write("\rRecord count %i" % count)
    sys.stdout.flush()
    r = contents[rand]
    # pre = preprocessing.HTMLPreprocessing(r.payload)
    payload = r.payload
    r['Content-Length'] = str(len(payload))
    r['WARC-Record-ID'] = str(uuid.uuid4())
    f_sample.write_record(warc.WARCRecord(payload=payload, headers=r.header))
    count += 1
print '\n'
f_sample.close()
import warc

f = warc.open('/home/tamaki/corpus/DiskB/ClueWeb12_08/0817wb/0817wb-49.warc.gz')
i = 0
for record in f:
    h = record.header
    test = record.payload.read()
    WARC_Trec_ID = h.get("WARC-Trec-ID")
    i += 1
    if i > 1:
        filepath = '/home/fanyimeng/test/' + str(WARC_Trec_ID) + '.html'
        with open(filepath, 'w') as d:
            d.write(test)
filetypes = ['.CSV', '.XLS', '.XLSX', '.JSON', '.RDF', '.ZIP']
geofiletypes = ('.GEOJSON', '.GML', '.GPX', '.GJSON', '.TIFF', '.SHP', '.KML',
                '.KMZ', '.WMS', '.WFS')
filetypes.extend(geofiletypes)

csvoutfile = open(sys.argv[1] + '.data.csv', 'a+b')
datawriter = csv.writer(csvoutfile, delimiter=',')
columns = ['Stadt_URL', 'URL_Datei', 'URL_Text', 'URL_Dateiname', 'Format',
           'geo', 'URL_PARENT', 'Title_Parent']
datawriter.writerow(columns)

f = warc.open(sys.argv[2])
domain = sys.argv[1]
blacklist = ('.jpg', '.gif', '.ico', '.txt', '.pdf', '.png', 'dns:', '.css', '.js')
for record in f:
    if (('WARC-Target-URI' in record.header)
            and (domain in record['WARC-Target-URI'])
            and not any(x in record['WARC-Target-URI'] for x in blacklist)
            and 'metadata' in record['warc-type']):
        # for item in record.__dict__['header'].items():
        #     print item
        for line in record.__dict__['payload'].read().split('\n'):
            if any(ext in line.upper() for ext in filetypes):
                url = line.split(' ')[1]
                extension = url.split('.')[-1].upper()
bucket = lsh.hash_tables[0].get_val(key)
for query_object in bucket:
    candidates = lsh.query(query_object[0], distance_func='cosine')
    dedup.add(query_object[1])
    for c in candidates:
        candidate_key = c[0][1]  # warc id is appended as extra data in lsh.index()
        if candidate_key == query_object[1]:
            continue
        candidate_distance = c[1]
        if float(candidate_distance) >= threshold:
            dedup.add(candidate_key)
        elif candidate_key in dedup:
            dedup.remove(candidate_key)

file = warc.open(filenameIn + '_dedup.warc.gz', 'wb')
numSingle = len(dedup)
for i in range(0, max_files):
    with gzip.open(datasetPath + filenameIn + str(i) + '.warc.gz', mode='rb') as gzf:
        for record in warc.WARCFile(fileobj=gzf):
            record_id = record['WARC-Record-ID']
            if record_id in dedup:
                payload = record.payload.read()
                file.write_record(warc.WARCRecord(payload=payload, headers=record.header))

print 'Total pages: ' + str(doc_count)
print 'Pages after deduplication: ' + str(numSingle)
file.close()
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import collections
import argparse

argparser = argparse.ArgumentParser(
    description='processes warc files and returns pie chart with domains distribution')
argparser.add_argument('--input', required=True, help='Path to the warc file')
argparser.add_argument('--output', required=True, help='Path to the output image of pie chart')
args = argparser.parse_args()

domains_counter = collections.Counter()
with warc.open(args.input) as f:
    for record in f:
        if 'WARC-Target-URI' not in record:
            continue
        host_with_port = urlparse(record['WARC-Target-URI']).netloc
        host = host_with_port.split(':')[0]
        domain = host.split('.')[-1]
        domains_counter.update([domain])

common = dict(domains_counter.most_common(10))
domains = list(common.keys())
total = sum(common.values())
shares = [common[domain] / total for domain in domains]
fig, ax = plt.subplots()
import urllib.request
import warc

if __name__ == '__main__':
    f = warc.open("test.warc.gz")
    for record in f:
        print(record['WARC-Target-URI'], record['Content-Length'])
def main():
    """ main routine """
    if not os.path.exists(LEXICON_PATH):
        try:
            os.mkdir(LEXICON_PATH)
        except Exception:
            print 'Error making %s, exiting.' % LEXICON_PATH
            return
    wet_files = get_wet_files()
    docid_gen = docid_generator()
    url_index = UrlIndex()
    word_index = WordIndex()
    for wet_file in wet_files:
        print wet_file
        wet_fd = warc.open(wet_file)
        doc_next_offset = 0
        lexicon_file = os.path.join(
            LEXICON_PATH,
            '.'.join(os.path.basename(wet_file).split('.')[:-1] + ['lexicon']))
        lex_fd = open(lexicon_file, 'wb')
        for wet_record in wet_fd:
            if wet_record.url:
                docid = docid_gen.next()
                url = wet_record.url
                url_lens, url_fileid, url_offset = url_index.write_url_index(url)
                doc_fileid = 88
                doc_offset = doc_next_offset if doc_next_offset else 0
                doc_header_length = wet_record.payload.fileobj.tell() - doc_offset
                doc_length = doc_header_length + wet_record.header.content_length
                content_offset = doc_header_length
                content_length = wet_record.header.content_length
                print docid, url, (url_fileid, url_offset, url_lens), \
                    (doc_fileid, doc_offset, doc_length, content_offset, content_length)
                # docid(4B), url_pos[fileid(2B), offset(4B), lens(2B)],
                # doc_pos[fileid(2B), offset(4B), lens(4B), con_offset(2B), con_lens(4B)]
                url_idx_data = pack('=IHIHHIIHI',
                                    docid, url_fileid, url_offset, url_lens, doc_fileid,
                                    doc_offset, doc_length, content_offset, content_length)
                url_index.write_url_index_entry(url_idx_data)
                # generate lexicons
                saved_offset = wet_record.payload.fileobj.tell()
                page_content = wet_record.payload.fileobj.read(content_length)
                wet_record.payload.fileobj.seek(saved_offset)
                for token, start, end in split_with_offset(page_content):
                    if is_ascii(token) and len(token) > 0 and len(token) < 256:
                        word_id = word_index.add_entry(token)
                        lexicon_data = pack('iiih', word_id, docid, start, 2)
                        lex_fd.write(lexicon_data)
                doc_next_offset = (wet_record.payload.fileobj.tell() +
                                   wet_record.payload.length)
        lex_fd.close()
        wet_fd.close()
# prefix= "https://commoncrawl.s3.amazonaws.com/" # exurl ="crawl-data/CC-MAIN-2019-13/segments/1552912202658.65/warc/CC-MAIN-20190322115048-20190322141048-00227.warc.gz" # finalurl= prefix+exurl # finalurl = finalurl.replace('/warc/','/wet/').replace('.warc.','.warc.wet.') # wget.download(exurl) keywords= 'trump' or 'president' or 'government' or 'party' or 'people' or 'election' or 'state' or 'house' or 'political' \ or 'politics' or 'republican' or 'vote' or 'administration' count = 0 # exurl ="https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-13/segments/1552912202658.65/wet/CC-MAIN-20190322115048-20190322141048-00227.warc.wet.gz" exurl2 = "https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-13/segments/1552912203021.14/wet/CC-MAIN-20190323201804-20190323223804-00227.warc.wet.gz" exurl3 = "https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-13/segments/1552912201672.12/wet/CC-MAIN-20190318191656-20190318213656-00398.warc.wet.gz" # wget.download(exurl) records = warc.open("D:\MS\\2ndSem\DIC\Lab2\\crawlData\\text.warc.wet") newfile = open("D:\\MS\\2ndSem\\DIC\\Lab2\\crawlData\\crawl2.txt", 'a') for record in records: url = record.header.get('WARC-Target-URI', None) if url: text = record.payload.read() try: if (count < 520 and detect(text.decode("utf-8")) == 'en'): if keywords in text.decode("utf-8").lower(): newfile.write(str(text, 'utf-8')) newfile.write('\n\n') count = count + 1 print(count) except Exception: pass
from modules import NumberGenerator

docIDDigits = 4
frequancyDigits = 2
docIdGenerator = NumberGenerator.Number(digits=docIDDigits, after=args.startID)
r = redis.Redis(unix_socket_path=args.redis, db=args.redisDB)
if args.docIDwet:
    docIDwetFile = open(args.docIDwet, mode='a')
else:
    docIDwetFile = sys.stderr

for filepath in args.files:
    print("* Dealing:", filepath, file=sys.stderr)
    with warc.open(filepath, 'rb') as f:
        for (record, offset, _) in tqdm(f.browse(), unit='records'):
            URI = record.url
            if URI:
                content = record.payload.read()
                if content:
                    (lang, langConfidence) = Language.classify(content)
                    if lang in space_devided_langs:
                        words = latin_sep_words.split(str(content))
                    elif lang == 'zh' and not args.skipChinese:
                        words = jieba.cut(content, cut_all=False)
                        # words = list(words)
                        words = [word for word in words if non_latin_words_pattern.match(word)]
                    else:
                        # other languages
                        continue
folderidx = sys.argv[1]
foldername = '/home/cluo/publicdata/DiskB/ClueWeb12_' + folderidx
pages = set()
for l in open('../ref/enpages.txt').readlines():
    pages.add(l.strip())
for folder in os.listdir(foldername):
    subfoldername = foldername + '/' + folder
    for filename in os.listdir(subfoldername):
        print subfoldername, filename
        os.system('7z e ' + subfoldername + '/' + filename)
        warcfilename = filename.replace('.7z', '')
        f = warc.open(warcfilename)
        for record in f:
            id = record.header.get('WARC-TREC-ID')
            if id in pages:
                fout = open('../data/parse/' + str(record.header.get('WARC-TREC-ID') + '.html'), 'w')
                content = record.payload.read()
                write = False
                for l in content.split('\n'):
                    if '<!DOCTYPE' in l:
                        write = True
                    if write == True:
                        fout.write(l + '\n')
                fout.write(content)
                fout.close()
        os.system('rm ' + warcfilename)
import warc
import uuid
import sys
import os
import gzip

os.chdir('/home/eckel/')

# Load and preprocess data
print 'preprocessing'
filenameIn = sys.argv[1]
max_range = int(sys.argv[2])
for i in range(0, max_range):
    print filenameIn + str(i)
    fw = warc.open('dataset_id/' + filenameIn + str(i) + '.warc.gz', 'wb')
    with gzip.open('dataset/' + filenameIn + '.com' + str(i) + '.warc.gz', mode='rb') as gzf:
        for record in warc.WARCFile(fileobj=gzf):
            record['WARC-Record-ID'] = str(uuid.uuid4())
            fw.write_record(warc.WARCRecord(payload=record.payload.read(),
                                            headers=record.header))
    fw.close()
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 21 10:20:14 2016

@author: ZJun
"""

import warc

f = warc.open("00.warc.gz")
f.read_record()
i = 0
for record in f:
    i = i + 1
    print record['WARC-Target-URI'], record['Content-Length']
    if i > 10:
        break
f.close()

f = warc.open("00.warc.gz")
a = f.read_record()
b = f.read_record()
'''
a.header.items()
Out[52]:
[('warc-type', 'warcinfo'),
 ('content-length', '219'),
 ('version', '0.18'),
 ('warc-date', '2009-03-75T00:59:24-0400'),
 ('content-type', 'application/warc-fields'),
 ('warc-record-id', '<urn:uuid:b38cd8ab-5ba6-445c-9c9c-0a5cbc3b6a41>')]
from __future__ import print_function
import warc
import sys

if len(sys.argv) != 2:
    print("Usage: {} <input_file>".format(sys.argv[0]))
    sys.exit(0)

f = warc.open(sys.argv[1])
for record in f:
    print(record.url)
    # record.header => dictionary containing the header field names and their values
    # record.payload => Payload object, can be read with record.payload.read()
import warc
from textblob import TextBlob
import string
import sys
import urllib
import os

commonPath = 's3://aws-publicdatasets/'
for line in sys.stdin:
    # download file
    url = commonPath + line
    os.system('aws s3 cp ' + url + ' ./file.warc.wet.gz')
    print 'wet file downloaded'
    # open the warc file
    f = warc.open('file.warc.wet.gz')
    txtFile = open('data/' + line[70:140] + '.txt', 'w')
    # read the warc file and write to text file
    for record in f:
        if record['Content-Type'] == 'text/plain':
            date = record['WARC-Date']
            htmlText = record.payload.read()
            text = htmlText.strip()
            text = text.translate(None, '[!@#$+:|/\#%^*()-_=~]\n')
            printableText = filter(lambda x: x in string.printable, text)
            blob = TextBlob(printableText)
            for sentence in blob.sentences:
                if len(sentence) <= 340:
                    txtFile.write(date[0:10])
                    txtFile.write('\t')
# https://gist.github.com/Smerity/afe7430fdb4371015466
# https://pypi.org/project/langdetect/
# !pip install warc3-wet
import warc
import wget
from langdetect import detect

exurl = "https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-13/segments/1552912202658.65/wet/CC-MAIN-20190322115048-20190322141048-00227.warc.wet.gz"
wget.download(exurl, out="../Data/Commoncrawl")

from sh import gunzip
gunzip("../Data/Commoncrawl/CC-MAIN-20190322115048-20190322141048-00227.warc.wet.gz")

records = warc.open("../Data/Commoncrawl/CC-MAIN-20190322115048-20190322141048-00227.warc.wet")
counter = 0
data = []
urls = set()
keywords = [
    'soccer', 'basketball', 'sports', 'baseball', 'score', 'hockey', 'mls',
    'nba', 'ncaa', 'nfl', 'knicks', 'mma', 'nhl', 'golf'
]
for record in records:
    if counter == 600:
        break
    else:
        url = record.header.get('warc-target-uri', None)
        if not url:
            continue
        if url not in urls:
            urls.add(url)
def process(self, item):
    # Assert that this item is flagged for sampling. If not, return
    # immediately. We don't want to butcher uploads that have been
    # determined to be worth saving in their original state.
    #
    # Presumably, the tracker is tagging these items as something
    # appropriate. Alternately, one could create a "Phase 3" grab
    # and know for a fact that we are only receiving videos that
    # should be sampled. In which case, one may skip the item_type
    # check and proceed directly to sampling.
    item_name = item['item_name']
    item_type, item_value = item_name.split(':', 1)
    item['item_type'] = item_type
    item['item_value'] = item_value
    assert item_type in ('video-bulk', 'url-bulk')

    # Item type is not marked as "video-bulk" or "url-bulk" by the tracker.
    # Carry on. Nothing to do here.
    if item_type not in ('video-bulk', 'url-bulk'):
        return

    # OK. This is an item that needs to be sampled.

    # Remember where we started from so we can get back there and not mess
    # up the expectations for the rest of the stages in the pipeline.
    original_path = os.getcwd()

    # Get to item_dir; begin work.
    os.chdir(item['item_dir'])

    # We will need some data from the warcfile.
    warcinfo_record_ID = ""
    metadata_record_ID = ""
    truncated_record_ID = ""

    # Set up old and new warc files for reading and writing, respectively.
    # If a file ends in *.gz for writing, the warc library handles gz
    # compression transparently.
    old_warc_file = warc.open("%(warc_file_base)s.warc.gz" % item)
    new_warc_file = warc.open("%(warc_file_base)s-POSTPROCESSED.warc.gz" % item, "w")

    # ------------------------ Start of main for loop ------------------- #
    # and here... we... go
    for record in old_warc_file:
        # Firstly, we detect whether the record we're iterating over holds
        # data we'll need later. If so, behave appropriately. After the
        # if-elif-elif dance, we proceed to copy each record into a new
        # record in the %(warc_file_base)s-POSTPROCESSED.warc.gz file,
        # modifying as necessary (truncating long records, etc.)

        # ------------------------ Check for data ----------------------- #
        # Grab the lengthy payload (the flv file); if the content-length is
        # longer than ~5MiB, and the record is of the "response" type, then
        # this record *probably* has the flv file.
        if (long(record['Content-Length']) >= 5000000
                and record['WARC-Type'] == "response"):
            # Need the record id of the original flv record. Will reference
            # it in the truncated record.
            truncated_record_ID = record['warc-record-id']
            # Add "WARC-Truncated" to this record, indicating that it has
            # been truncated due to length.
            record['warc-truncated'] = "length"
            # Extract the payload.
            tempfile = open("intermediate.int", 'wb')
            for line in record.payload:
                tempfile.write(line)
            tempfile.close()
            # Put the payload back; iterating through record.payload
            # invokes a generator on the payload that seems to "eat up" the
            # payload in the original file. I say so because attempting to,
            # say, write the payload out twice (to TWO files) will fail, as
            # will any attempt to read out the payload again without first
            # "putting it back." (I'd love an explanation for just what's
            # going on here; but for now, this hack works.)
            # (For the record with the long content-length, we end up
            # reading the payload twice; once here, to get it to a separate
            # file, and once again, in COPY PAYLOAD, to write out a
            # truncated version to the new warc file.)
            stream = StringIO(open("intermediate.int", 'rb').read())
            stream.seek(0, os.SEEK_END)
            streamlength = stream.tell()
            stream.seek(0)
            record.payload = warc.utils.FilePart(fileobj=stream, length=streamlength)
            # Can't close the stream yet for some reason. This might
            # introduce leaks of some sort, so keep an eye on it.
            # The relevant error: "IO Operation on a closed file."
            # I suspect this operation occurs somewhere in the warc
            # library, and I'm hoping that the stream object just falls out
            # of scope at some point other than when the entire pipeline
            # shuts down.
            # stream.close()

        # Adjust the warcinfo record to note that we also utilized ffmpeg.
        elif record['WARC-Type'] == "warcinfo":
            # Grab the record-id for later use in resource records.
            warcinfo_record_ID = record['warc-record-id']
            # Gotta add another "software" key to the content-block of the
            # warcinfo record that indicates the use of ffmpeg.
            warcinfo_stream = StringIO()
            for line in record.payload:
                warcinfo_stream.write(line)
            # The trailing \r\n\r\n is already present in the payload; just
            # seek back two bytes (yes, the second \r\n will get clobbered;
            # potential unicode byte-length issues here) and then tack on
            # the additional lines you need to, like so:
            warcinfo_stream.seek(-2, os.SEEK_END)
            warcinfo_stream.write("software: ffmpeg/2.3.1\r\n\r\n")
            warcinfo_stream.seek(0, os.SEEK_END)
            warcinfo_stream_len = warcinfo_stream.tell()
            warcinfo_stream.seek(0)
            record.payload = warc.utils.FilePart(fileobj=warcinfo_stream,
                                                 length=warcinfo_stream_len)

        # Get the metadata record's warc-record-id for later resource
        # records.
        elif record['WARC-Type'] == "metadata":
            metadata_record_ID = record['warc-record-id']

        # End of conditionals. Proceed to write the new record to the
        # post-processed warcfile.

        # ------------------------ Copy Record -------------------------- #
        # COPY HEADER
        # Should we add defaults=False? It seems that some additional
        # headers are added in WARCHeader as well as WARCRecord. However,
        # they don't seem harmful: digests and timestamps.
        new_header = warc.WARCHeader(record.header)

        # COPY PAYLOAD
        # If the current record gets truncated, then set the content-length
        # to the new, truncated length as per spec.
        truncated_flag = None

        # SHORT record payloads
        if long(record['content-length']) < 500000:
            # print "Copying payload..."
            new_payload = StringIO()
            for line in record.payload:
                new_payload.write(line)
            # If we don't seek back to 0, new_payload.read() is empty.
            new_payload.seek(0)
            # print "Done copying payload."
        # LONG record payloads (the one that probably has video data)
        else:
            # print "Found long content-length. Truncating..."
            new_payload = StringIO()
            decrement = 25
            # Grab some lines.
            # print "Gonna grab some lines. Decrement: ", decrement
            for line in record.payload:
                # print "Grabbing a line."
                new_payload.write(line)
                decrement -= 1
                # print "Decrement: ", decrement
                if decrement == 0:
                    break
            # Be kind: rewind.
            new_payload.seek(0)
            truncated_flag = True
            # print "Done truncating."

        # CREATE RECORD FROM HEADER AND PAYLOAD
        new_rec = warc.WARCRecord(payload=new_payload.read(),
                                  headers=new_header, defaults=False)

        # If this record happened to be one that got truncated, then we
        # need to adjust its content-length header.
        if truncated_flag:
            # print "Adjusting content-length header"
            # From page 9 of the ISO WARC Standard:
            #
            # "The WARC-Truncated field may be used on any WARC record. The
            # WARC field Content-Length shall still report the actual
            # truncated size of the record block."

            # Get the length of the truncated content-block and set the
            # Content-Length header appropriately.
            new_payload.seek(0, os.SEEK_END)
            thelength = new_payload.tell()
            new_rec['content-length'] = str(thelength)
            new_payload.seek(0)

        # WRITE THE NEW RECORD OUT TO THE NEW WARCFILE
        # (The warc library handles the gz-compression and putting each
        # record in a separate gz "member" transparently; no need to muck
        # with the gzip library ourselves.)
        # print "Copying record to new .warc.gz"
        new_warc_file.write_record(new_rec)
        # print "Done copying record to new .warc.gz"

    # ------------------------ END OF MAIN FOR LOOP --------------------- #

    # At this point, we have a new warcfile with copied and truncated
    # records; now, we need to sample the content and add these
    # "conversion" records to the warc file.

    # Should probably delete the old warc at this point, since the new
    # warcfile has all of the old records, and we've already got another
    # copy of the main payload. If we proceed to write out the full newfile
    # with the shrunken payload before deleting the old warc, we'll
    # basically be using nearly 3x the interim diskspace rather than 2x.
    # (Don't get me wrong, I'd love to have more of a generator-like setup
    # that negates the need to use twice the disk space, but it's beyond
    # the scope of my abilities at the moment and I don't think I'd be able
    # to get up to speed before the deadline for this project drops
    # (August 27 2014). Update: LOL Twitch is already deleting things on
    # August 26; oh well, I suppose this code could come in handy if the IA
    # suddenly needs to compress lots of material.)

    # Now, we need to convert the flv, and add conversion records.
    # Our "payload.flv" is not quite an flv yet; the payload still includes
    # the HTTP response headers. We need to grep for "CRLFCRLF" and then
    # chop off anything prior to it, including it, leaving nothing but the
    # flv file for ffmpeg to work with.
    thefile = open("intermediate.int").read()  # NOT A FILE; just a "str"
    theflv = thefile.split('\r\n\r\n')[1]
    writetheflv = open("samplethis.flv", "w")
    writetheflv.write(theflv)
    writetheflv.close()

    # Get snapshots.
    SnapShot()
    # Get shrunken video.
    ShrinkRay()

    # Clean up.
    print("********************* \n\n Removing temporary files; cleaning up \n\n*********************")
    # Remove original file intermediates: "intermediate.int" and "samplethis.flv"
    rmargs = shlex.split("rm intermediate.int samplethis.flv")
    call(rmargs)

    # And we're done!
    new_warc_file.close()
    os.chdir(original_path)
import warc
import sys
from tqdm import tqdm

print("** Test 1")
print("*Pass 1")
offsets = []
with warc.open('data/wet/CC-MAIN-20170919112242-20170919132242-00000.warc.wet.gz', 'rb') as f:
    i = 0
    for record, offset, length in f.browse():
        if i > 3:
            break
        URI = record.url
        print(i, offset, length, URI)
        if URI:
            offsets.append(offset)
        i += 1
print("seeks:", offsets)

print("*Pass 2")
with warc.open('data/wet/CC-MAIN-20170919112242-20170919132242-00000.warc.wet.gz', 'rb') as f:
    i = 2
    f.seek(offsets[i])
    for record, offset, length in f.browse():
        if i > 5:
            break
import warc
import uuid
import os

os.chdir('/home/eckel/')
f = warc.open("samples/overstock_sample.warc.gz", "rb")
fw = warc.open("overstock_test.warc.gz", "wb")
count = 0
for record in f:
    if (record['WARC-Record-ID'] == '2dd726fe-5f11-43c3-a02c-47860e668cac'
            or record['WARC-Record-ID'] == '4b3e1e5f-9ac3-4619-b784-a093a1d1ac0d'):
        payload = record.payload.read()
        record_header = record.header
        fw.write_record(warc.WARCRecord(payload=payload, headers=record.header))
        fw.write_record(warc.WARCRecord(payload=payload, headers=record.header))
        fw.write_record(warc.WARCRecord(payload=payload, headers=record.header))
        fw.write_record(warc.WARCRecord(payload=payload, headers=record.header))
    # elif count < 2:
    #     payload = record.payload.read()
    #     record_header = record.header
    #     fw.write_record(warc.WARCRecord(payload=payload, headers=record.header))
    #     count += 1
f.close()
fw.close()
import urllib.request
import warc

if __name__ == '__main__':
    urls = [
        'https://elpais.com/', 'https://elpais.com/tag/gente/a',
        'https://politica.elpais.com/', 'https://elpais.com/internacional/'
    ]
    f = warc.open("test.warc.gz", "w")
    for u in urls:
        fp = urllib.request.urlopen(u)
        mybytes = fp.read()
        mystr = mybytes.decode("utf8")
        fp.close()
        header = warc.WARCHeader({"WARC-Type": "response"}, defaults=True)
        header['WARC-Target-URI'] = u
        record = warc.WARCRecord(header, mybytes)
        f.write_record(record)
    f.close()

    for u in urls:
        f = warc.open("test_trozos.warc.gz", "a")
        fp = urllib.request.urlopen(u)
pickle.dump(text_information, writer)

warc_path_prefix = os.path.abspath(sys.argv[1])
file_path = os.path.abspath(sys.argv[2])
warc_records = {}
counter = 0
part_number = 1
for warc_path in glob.glob(warc_path_prefix + "*"):
    if (not warc_path.endswith("warc") and not warc_path.endswith("warc.gz")
            and not warc_path.endswith("warcs.tgz")):
        print("skipped", warc_path)
        continue
    try:
        f = warc.open(warc_path)
        print("reading", warc_path)
        for record in f:
            try:
                target_uri = record["WARC-Target-URI"]
                html_text = record.payload.read()
                try:
                    html_text = html_text.decode("utf-8", "ignore")
                except:
                    pass
                warc_records[target_uri] = html_text
            except:
                pass
            counter += 1
            if counter % 10000 == 0:
import warc

f = warc.open(r"C:\Users\wouter\Documents\GitHub\node-warc\warc\crawl0.warc")
for record in f:
    print record['Content-Length']
def extract_sentences(clueweb_directory, freebase_directory):
    sub_directories = os.listdir(freebase_directory)
    for directory in sub_directories:
        clueweb_subdir = os.path.join(clueweb_directory, directory)
        if not os.path.isdir(os.path.join(clueweb_subdir)):
            sys.stderr.write("%s is not a directory\n" % (directory))
            continue
        freebase_subdir = os.path.join(freebase_directory, directory)
        if not os.path.isdir(os.path.join(freebase_subdir)):
            sys.stderr.write("%s is not a directory\n" % (directory))
            continue
        leaf_files = os.listdir(freebase_subdir)
        for leaf_file in leaf_files:
            warc_file = os.path.join(clueweb_subdir,
                                     leaf_file.replace(".anns.tsv", ".warc.gz"))
            annotated_file = os.path.join(freebase_subdir, leaf_file)
            if not os.path.exists(warc_file):
                sys.stderr.write("Skipped: warc file does not exist: %s" % (warc_file))
                continue
            if not os.path.exists(annotated_file):
                sys.stderr.write("Skipped: annotated file does not exist: %s" % (annotated_file))
                continue
            warc_file_reader = warc.open(warc_file).__iter__()
            # records = [record for record in warc_file_reader]
            annotated_file_reader = open(annotated_file)
            current_record = None
            current_record_count = -2
            current_record_content = ""
            start_sentence = 0
            end_sentence = 0
            sys.stderr.write("%s %s\n" % (warc_file, annotated_file))
            entities_and_sentence = []
            for line in annotated_file_reader:
                line = line.rstrip()
                columns = line.split('\t')
                entity_tag = columns[-1]
                if entity_tag.startswith("/m/"):
                    record_number = int(columns[0].split("-")[-1])
                    encoding = columns[1].lower()
                    entity_name = columns[2]
                    start_offset = int(columns[3])
                    end_offset = int(columns[4])
                    score = float(columns[5])
                    # Get the current record
                    while current_record_count < record_number:
                        try:
                            current_record = warc_file_reader.next()
                        except:
                            current_record_count = -100
                            break
                        current_record_count += 1
                        if current_record_count == record_number:
                            try:
                                current_record_content = current_record.payload.decode(
                                    encoding, 'replace').encode('utf-8', 'replace')
                            except:
                                current_record_count = -100
                                break
                    if current_record_count < 0:
                        break
                    if current_record_content[start_offset:end_offset] != entity_name:
                        continue
                    '''if len(records) <= record_number + 1:
                        break
                    current_record_content = records[record_number + 1].payload.decode(encoding, 'replace').encode('utf-8', 'replace')'''
                    # Get the current sentence
                    if start_sentence <= start_offset and end_sentence >= end_offset:
                        entities_and_sentence.append([entity_tag, score,
                                                      start_offset - start_sentence,
                                                      end_offset - start_sentence])
                    else:
                        if entities_and_sentence != []:
                            print json.dumps(entities_and_sentence)
                        entities_and_sentence = []
                        [start_sentence, end_sentence] = get_sentence(
                            current_record_content, start_offset, end_offset)
                        if start_sentence != -1:
                            entities_and_sentence.append(
                                current_record_content[start_sentence:end_sentence])
                            entities_and_sentence.append([entity_tag, score,
                                                          start_offset - start_sentence,
                                                          end_offset - start_sentence])
    body = BeautifulSoup(html, "html.parser").find("body")
    if not body:
        return None
    # now strip HTML we don't like.
    for tag in body.findAll():
        if tag.name.lower() in blacklist:
            tag.extract()
        elif not tag.name.lower() in whitelist:
            tag.name = "span"
            tag.attrs = []
    return body.get_text()


f = warc.open(random_file())
domains_stat = {}
language_stat = {}
num_undetect_lang = 0
counter = 0
for record in f:
    if record.type == 'response':  # and record.http_headers.get_header('Content-Type') == 'text/html':
        counter += 1
        content = record.payload.read()
        text = plain_text(content)
        if text:
            try:
                lang = detect(text)
                if lang not in language_stat:
                    language_stat[lang] = 1
from pyspark.sql import SparkSession
from pyspark import SparkContext

spark = SparkSession.builder.master("local").getOrCreate()
sc = SparkContext.getOrCreate()

# simple spark
# rdd = sc.parallelize(["hello world"])
# count = rdd.flatMap(lambda x: x.split(' ')).map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
# output = count.collect()
# print(output)

# wordcount
# data = sc.textFile("./10.warc.gz")
# count = data.flatMap(lambda x: x.split(' ')).map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
# output = count.collect()
# output

# read warc and write to txt
import warc

f = warc.open("10.warc.gz")
inf = open("output06.txt", "a")
for record in f:
    inf.write(str(record))
inf.close()
f.close()
    else:
        flagCheckFiles = 1

if flagCheckFiles == 1:
    if quietMode == False:
        print "\n The given list contains one or more files that are not in valid WARC format"

if len(dirTree) == 0:
    if quietMode == False:
        print('\n No WARC files found in given list \n')
    sys.exit(0)

outputPath = Ddir
# generate new warc file name
newFile = timeStampedFilename("WARCMerge") + '.warc'
newFileFullPath = outputPath + '/' + newFile
filePtr = warc.open(newFileFullPath, "w")
outputFilesList.append(newFileFullPath)
flag = 0
outputFileSize = os.path.getsize(newFileFullPath) / forConvertToMB

# Sorting files by sizes
sortFiles(dirTree)
if quietMode == False:
    print
    print 'Merging the following WARC files: '
    print '----------------------------------: '

for warcFile in dirTree:
    outputFileSize = os.path.getsize(newFileFullPath) / forConvertToMB
import io
import time
import nltk
import os
import json
import warc
import requests
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt')

if not os.path.isdir("./output"):
    os.mkdir("./output")

punc = '''!()-[]{};:'"\, <>./?@#$%^&*_~\n='''
count = 1
output = {}
f = warc.open("01.warc.gz")
for record in f:
    if record.type == 'response':
        print "21 record.type: ", record.type, " , record.url: ", record.url
        r = requests.get(record.url, verify=False)
        read = r.text.encode('ascii', 'ignore').lower()
        print "\n\n\n24 ", type(read), " read: ", read
        text_tokens = word_tokenize(read)
        nltk_tokens = [word for word in text_tokens if not word in stopwords.words()]
        print "\n\n\n27 nltk_tokens: ", nltk_tokens
        # tmp = sorted(set(tmp), key=tmp.index)
        ordered_tokens = set()
        tokens_without_sw = []
        for word in nltk_tokens:
            if word not in ordered_tokens:
                ordered_tokens.add(word)
q = norm.pdf(x, 1, 1)
with gzip.open(r"C:\ClueWeb09\2008-10-01", "rb") as f1:
    file_content = f1.read()
    x = f1.readline()
trecCW9 = norm.pdf(x, 0, 2)
q = norm.pdf(x, 1, 1)

# clue_web_01_08_df = pd.read_csv(r'C:\ClueWeb09\2008-01-01\en0000\01.warc.gz',
#                                 compression='gzip', header=1, sep='\t', quotechar='"')

# Access the processed warc files based off of indexes from the Indri toolkit
f2 = warc.open(r"C:\Information Retrieval\ClueWeb09\2008-01-01\en0000\01.warc.gz")
for record in f2:
    print(record['Trec-'], record['Content-Length'])

f3 = warc.open(r"C:\Information Retrieval\ClueWeb09\2008-02-01\en0000\01.warc.gz")
for record in f3:
    print(record['Trec-'], record['Content-Length'])

f4 = warc.open(r"C:\Information Retrieval\ClueWeb09\2008-03-01\en0000\01.warc.gz")
for record in f4:
    print(record['Trec-'], record['Content-Length'])

f5 = warc.open(r"C:\Information Retrieval\ClueWeb09\2008-04-01\en0000\01.warc.gz")
for record in f5:
    print(record['Trec-'], record['Content-Length'])