import io
import json
import logging
import os
from bz2 import BZ2File
from gzip import GzipFile


def articles(wiki_json_fn, limit=None):
    """Yield {'id', 'title', 'text'} dicts from an Elasticsearch-style wiki dump.

    The dump alternates action and source lines; plain, .gz and .bz2 files
    are supported. is_page() is assumed to be defined elsewhere.
    """
    count = 0
    _, ext = os.path.splitext(wiki_json_fn)
    if ext == '.gz':
        f = GzipFile(wiki_json_fn, mode='r')
    elif ext == '.bz2':
        f = BZ2File(wiki_json_fn, mode='r')
    else:
        f = io.open(wiki_json_fn, mode='rb')
    while True:
        line = f.readline()
        if line == b'':
            break
        action = json.loads(line.decode('utf-8'))
        line = f.readline()
        if line == b'':
            break
        source = json.loads(line.decode('utf-8'))
        if is_page(action, source):
            yield {'id': action['index']['_id'],
                   'title': source['title'],
                   'text': source['text']}
            count += 1
            if limit and count > limit:
                return
            if count % 10000 == 0:
                logging.info("read %d articles" % count)
    f.close()
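# Usage sketch (illustrative, not from the original source): stream a capped
# number of articles from a hypothetical dump file; assumes is_page() is
# defined alongside articles().
for article in articles('enwiki-dump.json.gz', limit=1000):
    print(article['id'], article['title'])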
def _parseSingleFile(self, logname):
    # get the log; fall back to a local path if it is not a URL
    try:
        log, headers = urllib.urlretrieve(logname)
    except IOError:
        log = logname
    # probe for gzip by attempting a compressed readline
    try:
        fp = GzipFile(log)
        fp.readline()
        fp.seek(0)
    except IOError:
        fp = open(log, "rb")

    # parse the log
    parser = self.parsers.get(self.harnessType,
                              TinderboxParser)(includePass=self.includePass)
    lineno, results = parser.parse(fp)
    fp.close()

    # dump output
    results.update({'filename': os.path.basename(logname)})

    # create a sha1 hash to be this json's id
    m = hashlib.sha1()
    m.update(json.dumps(results))
    id = ""
    if 'starttime' in results:
        id += str(results['starttime']) + '-'
    id += m.hexdigest()
    results.update({'id': id})
    return results
class Reader(object):
    """Reader class"""

    def __init__(self, wikidata_dump_fn, decoding="utf-8"):
        """
        :param wikidata_dump_fn: wikidata dump filename
        :param decoding: encoding used
        """
        self.wikidata_dump_fn = wikidata_dump_fn
        self.decoding = decoding
        self.dump = GzipFile(wikidata_dump_fn, 'r')
        self.line = self.dump.readline()

    def has_next(self):
        """
        Check if there are still entries to be read
        (note: this advances the reader by one line).

        :return: True if more entries are available
        """
        self.line = self.dump.readline().decode(self.decoding)
        # was "self.line is ''": identity comparison bug, use equality
        return self.line != ''

    def next(self):
        """
        Return the next entry

        :return: next entry, or None if the current line is not valid JSON
        """
        try:
            return json.loads(self.line.strip('\n,'))
        except json.decoder.JSONDecodeError:
            return None
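# Usage sketch (illustrative; 'wikidata-all.json.gz' is a hypothetical path).
# has_next() advances the reader, so call it exactly once per next():
reader = Reader('wikidata-all.json.gz')
while reader.has_next():
    entry = reader.next()
    if entry is not None:
        print(entry.get('id'))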
def parse_file(log):
    print log
    # probe for gzip by attempting a compressed readline
    try:
        fp = GzipFile(log)
        fp.readline()
        fp.seek(0)
    except IOError:
        fp = open(log, "rb")
def _cat(data, dups, fin, N, n, grep, encoding=None, filter_out=False):
    bom = fin.read(2)
    need_newline = True
    # note: only two bytes are read here, so the 4-byte UTF-32 BOMs below can
    # never actually match (a UTF-32-LE BOM matches the UTF-16-LE branch).
    if bom == codecs.BOM_UTF16_LE:
        fin = codecs.EncodedFile(fin, 'utf-8', 'utf-16-le')
    elif bom == codecs.BOM_UTF16_BE:
        fin = codecs.EncodedFile(fin, 'utf-8', 'utf-16-be')
    elif bom == codecs.BOM_UTF32_LE:
        fin = codecs.EncodedFile(fin, 'utf-8', 'utf-32-le')
    elif bom == codecs.BOM_UTF32_BE:
        fin = codecs.EncodedFile(fin, 'utf-8', 'utf-32-be')
    elif bom == '\x1f\x8b':  # gzip magic number
        if N:
            raise ValueError('Tail is not supported for GZip files')
        fin.seek(0)
        fin = GzipFile(mode='r', fileobj=fin)
    elif encoding is not None:
        fin = codecs.EncodedFile(fin, 'utf-8', encoding)
        fin.seek(0)
        need_newline = False
    else:
        need_newline = False
        fin.seek(0)
    if need_newline:
        fin.readline()  # skip the (possibly partial) line after the BOM

    if N:
        data += tail(fin, N, grep, filter_out)
    elif grep or n:
        for line in fin:
            line = line.rstrip('\n')
            matches = None
            if grep:
                matches = grep.search(line)
            if not grep or (not filter_out and matches) or \
                    (filter_out and not matches):
                if matches:
                    groups = matches.groups()
                    if groups:
                        record = '\t'.join(groups)
                        if record not in dups:
                            data.append(record)
                            dups.add(record)
                    else:
                        data.append(line)
                else:
                    data.append(line)
                if n and len(data) >= n:
                    break
    else:
        block_size = 4 * 8192
        block = fin.read(block_size)
        if len(block) == block_size:
            block += "\n[FILE TRUNCATED, USE DOWNLOAD]"
        data.append(block)
def gunzip(gzFileName, fileName):
    inFile = GzipFile(gzFileName)
    outFile = open(fileName, "w+")
    line = inFile.readline()
    while line:
        outFile.write(line)
        line = inFile.readline()
    inFile.close()
    outFile.close()
def test_group_events_together(self) -> None:
    buffer = MatchedEventsBuffer()
    buffer.add_event(
        EventMatch('id', 'version', 'log', 'dedup', {'key1': 'value1'}))
    buffer.add_event(
        EventMatch('id', 'version', 'log', 'dedup', {'key2': 'value2'}))

    self.assertEqual(len(buffer.data), 1)

    DDB_MOCK.update_item.return_value = {
        'Attributes': {'alertCount': {'N': '1'}}
    }

    buffer.flush()

    DDB_MOCK.update_item.assert_called_once()
    S3_MOCK.put_object.assert_called_once()
    SNS_MOCK.publish.assert_called_once()

    _, call_args = S3_MOCK.put_object.call_args
    data = GzipFile(None, 'rb', fileobj=call_args['Body'])

    # Verify first event
    event1 = json.loads(data.readline().decode('utf-8'))
    self.assertIsNotNone(
        datetime.strptime(event1['p_alert_creation_time'],
                          '%Y-%m-%d %H:%M:%S.%f000'))
    self.assertIsNotNone(
        datetime.strptime(event1['p_alert_update_time'],
                          '%Y-%m-%d %H:%M:%S.%f000'))
    self.assertEqual(event1['p_rule_id'], 'id')
    self.assertEqual(event1['p_alert_id'], 'id-1')
    self.assertEqual(event1['key1'], 'value1')

    # Verify second event
    event2 = json.loads(data.readline().decode('utf-8'))
    self.assertIsNotNone(
        datetime.strptime(event2['p_alert_creation_time'],
                          '%Y-%m-%d %H:%M:%S.%f000'))
    self.assertIsNotNone(
        datetime.strptime(event2['p_alert_update_time'],
                          '%Y-%m-%d %H:%M:%S.%f000'))
    self.assertEqual(event2['p_rule_id'], 'id')
    self.assertEqual(event2['p_alert_id'], 'id-1')
    self.assertEqual(event2['key2'], 'value2')

    # Assert that the buffer has been cleared
    self.assertEqual(len(buffer.data), 0)
    self.assertEqual(buffer.bytes_in_memory, 0)
def __init__(self, playfile, playback):
    instream = GzipFile(playback)
    header = instream.readline().rstrip().decode()
    if not (header.startswith("Kye ") and header.endswith(" recording:")):
        raise KDemoFormatError()
    # Check filename in the demo is what we have loaded.
    fn = instream.readline().rstrip().decode()
    if fn != os.path.basename(playfile):
        raise KDemoFileMismatch(fn)
    # Okay
    self.__level = instream.readline().rstrip().decode()
    self.__rng = pickle.load(instream)
    self.__s: GzipFile = instream
def build_index_gzip(self):
    """creates sorted index from gzip-compressed queue.
    caches object regardless of cacheobj flag.
    """
    self.index = []
    zf = GzipFile(fileobj=self.map, mode="rb")
    while 1:
        p = zf.tell()  # just for diagnosis use
        try:
            l = zf.readline()
        except IOError as ex:
            # probably CRC error due to truncated file. discard the rest.
            logging.error("error in %s at %d: %s", self.fn, p, str(ex))
            break
        if not l:
            break
        if l[0] != ' ':
            continue
        try:
            o = cjson.decode(l[1:])
        except Exception as ex:
            logging.warn("skipping malformed JSON at %s:%d: %s",
                         self.fn, p, l[1:])
            continue
        key = o.get('id')
        if key is None:
            try:
                key = self.urikey(o)
            except UnicodeEncodeError:
                pass
        if key is None:
            logging.error("urikey->None for %s", str(o))
            continue
        self.index.append((key, o))
    zf.close()
def getNewMrtgData(madeAdmUtilSnmpd):
    try:
        dbgOut = u" bootstrap: there was an error on mrtg data"
        dataFile = GzipFile(
            "lib/python/org/ict_ok/admin_utils/snmpd/snmp_mrtg_data.gz", "rb")
        if dataFile.readline() == "## mrtg data file for ict_ok.org\n":
            timeStamp = float(dataFile.readline())
            all_templ_data = pickle.load(dataFile)
            dataFile.close()
            madeAdmUtilSnmpd.mrtg_data = copy.deepcopy(all_templ_data)
            madeAdmUtilSnmpd.mrtg_data_timestamp = timeStamp
            dbgOut = u" bootstrap: new mrtg data (%s) loaded" % \
                (time.strftime("%Y-%m-%d %H:%M:%S +00", time.gmtime(timeStamp)))
    except ValueError:
        dbgOut = u" bootstrap: Hmm, format of mrtg data file incorrect"
    except IOError:
        dbgOut = u" bootstrap: Hmm, no mrtg data file"
    return dbgOut
def _loadFeatureTable(self):
    filename = self.dirname + FEATURE_FILENAME
    if not os.path.exists(filename):
        return
    f = GzipFile(filename)
    # discard first line
    f.readline()
    while True:
        line = f.readline().rstrip()
        if line == '':
            break
        (feat, n) = self._parseHashTableLine(line)
        self.emb.featTable[feat] = n
        self.emb.rFeatTable[n] = feat
    f.close()
def is_athena_project(filename):
    """tests whether file is a valid Athena Project file"""
    result = False
    if os.path.exists(filename):
        fh = None  # guard: fh may otherwise be unbound in the finally clause
        try:
            fh = GzipFile(filename)
            line1 = bytes2str(fh.readline())
            result = "Athena project file -- Demeter version" in line1
        except Exception:
            pass
        finally:
            if fh is not None:
                fh.close()
    return result
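# Usage sketch (illustrative; 'experiment.prj' is a hypothetical path):
if is_athena_project('experiment.prj'):
    print('Looks like an Athena project file')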
class PathListRead_gzip(object):
    def __init__(self, f):
        self.F = f
        self.G = GzipFile(fileobj=f, mode="rb")

    def read(self):
        """Read and return the next path, or None at EOF."""
        line = self.G.readline()
        if not line:  # EOF
            return None
        return line.strip()

    def paths(self):
        while True:
            path = self.read()
            if not path:
                break
            yield path
def getFileObjects(self):
    inputs = []
    for f in self.files:
        if f:
            fileid, fd, offset, size = f
            fd.seek(offset, 0)
            content = BytesIO(fd.read(size))
            input = GzipFile(fileid, 'rb', fileobj=content)
            try:
                firstline = input.readline().decode("utf-8", "ignore")
                if self.subformat == "ssa":
                    input = self.convertFromSsa(input)
                elif self.subformat != "srt":
                    input = self.convertFromSub(input)
                elif re.match(r"\{\d+\}\{\d+\}", firstline):
                    input = self.convertFromSub(input)
                input.seek(0)
            except:
                sys.stderr.write("Conversion problem: %s\n" % sys.exc_info()[1])
                continue
            # special case for Georgian (Python does not include the
            # georgian-ps encoding)
            if self.langcode == "ka":
                inputtext = input.read()
                import chardet
                if "utf-8" not in chardet.detect(inputtext)["encoding"].lower():
                    sys.stderr.write("Converting Georgian subtitle to UTF-8\n")
                    p = subprocess.Popen("iconv -f georgian-ps -t utf-8",
                                         shell=True, stdin=subprocess.PIPE,
                                         stdout=subprocess.PIPE)
                    out, err = p.communicate(inputtext)
                    input = BytesIO(out)
                else:
                    input = BytesIO(inputtext)
            inputs.append(input)
    return inputs
class ezfio_obj(object):
    def __init__(self, read_only=False):
        self._filename = "EZFIO_File"
        self.buffer_rank = -1
        self.read_only = read_only
        self.locks = {}

    def acquire_lock(self, var):
        locks = self.locks
        try:
            locks[var].acquire()
        except KeyError:
            locks[var] = threading.Lock()
            locks[var].acquire()

    def release_lock(self, var):
        self.locks[var].release()

    def set_read_only(self, v):
        self.read_only = v

    def get_read_only(self):
        return self.read_only

    def exists(self, path):
        if os.access(path + '/.version', os.F_OK) == 1:
            file = open(path + '/.version', "r")
            v = file.readline().strip()
            file.close()
            return True  # bug fix: the original fell through and returned None
        else:
            return False

    def mkdir(self, path):
        if self.read_only:
            self.error('mkdir', 'Read-only file.')  # error() takes (where, txt)
        if self.exists(path):
            self.error('mkdir', 'Group ' + path + ' exists')
        try:
            os.mkdir(path.strip())
        except OSError:
            pass
        file = open(path.strip() + '/.version', 'w')
        print >>file, self.version
        file.close()

    def error(self, where, txt):
        print '------------------------------------------------------------'
        print 'EZFIO File : ' + self.filename
        print 'EZFIO Error in : ' + where.strip()
        print '------------------------------------------------------------'
        print ''
        print txt.strip()
        print ''
        print '------------------------------------------------------------'
        raise IOError

    def get_filename(self):
        if not self.exists(self._filename):
            self.mkdir(self._filename)
        return self._filename

    def set_filename(self, filename):
        self._filename = filename

    filename = property(fset=set_filename, fget=get_filename)

    def set_file(self, filename):
        self.filename = filename
        if not self.exists(filename):
            self.mkdir(filename)
            self.mkdir(filename + "/ezfio")
            os.system("""
LANG= date > %s/ezfio/creation
echo $USER > %s/ezfio/user
echo %s > %s/ezfio/library""" % (filename, filename, self.LIBRARY, filename))

    def open_write_buffer(self, dir, fil, rank):
        if self.read_only:
            self.error('open_write_buffer', 'Read-only file.')
        l_filename = dir.strip() + '/' + fil + '.gz'
        if self.buffer_rank != -1:
            self.error('open_write_buffer',
                       'Another buffered file is already open.')
        self.buffer_rank = rank
        assert (self.buffer_rank > 0)
        try:
            self.file = GzipFile(filename=l_filename, mode='wb7')
        except IOError:
            self.error('open_write_buffer', 'Unable to open buffered file.')
        self.file.write("%2d\n" % (rank,))

    def open_read_buffer(self, dir, fil, rank):
        l_filename = dir.strip() + '/' + fil + '.gz'
        if self.buffer_rank != -1:
            self.error('open_read_buffer',
                       'Another buffered file is already open.')
        try:
            self.file = GzipFile(filename=l_filename, mode='rb')
        except IOError:
            self.error('open_read_buffer', 'Unable to open buffered file.')
        try:
            rank = eval(self.file.readline())
        except IOError:
            self.error('open_read_buffer', 'Unable to read buffered file.')
        self.buffer_rank = rank
        assert (self.buffer_rank > 0)
        return rank

    def close_buffer(self):
        assert (self.buffer_rank > 0)
        self.buffer_rank = -1
        self.file.close()

    def read_buffer(self, isize):
        if self.buffer_rank == -1:
            self.error('read_buffer', 'No buffered file is open.')
        indices = []
        values = []
        for i in xrange(isize):
            try:
                line = self.file.readline().split()
            except:
                return indices, values
            if len(line) == 0:
                return indices, values
            indices.append([int(i) for i in line[:-1]])
            values.append(eval(line[-1]))
        return indices, values

    def write_buffer(self, indices, values, isize):
        if self.read_only:
            self.error('write_buffer', 'Read-only file.')
        if self.buffer_rank == -1:
            self.error('write_buffer', 'No buffered file is open.')
        for i in xrange(isize):
            for j in indices[i]:
                self.file.write("%4d " % (j,))
            self.file.write("%24.15e\n" % (values[i],))
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Log-Bilinear model for relation extraction.')
    _arg = parser.add_argument
    _arg('--read-dump', type=str, action='store', metavar='PATH',
         help='Reads in a wikidata json dump.')
    args = parser.parse_args()

    train_set = None
    if args.read_dump:
        dump_in = GzipFile(args.read_dump, 'r')
        line = dump_in.readline()  # skip the opening '[' of the dump
        iter = 0
        while line:  # was "line != ''": GzipFile yields bytes, so that
                     # comparison never becomes false
            iter += 1
            line = dump_in.readline()
            try:
                # decode before stripping: readline() returns bytes
                ent = json.loads(line.decode('utf-8').rstrip('\n,'))
                if not ent['id'].startswith('Q'):
                    print("Skipping item with id {}".format(ent['id']),
                          file=sys.stderr)
                    continue
                print('\n'.join(['{}\t{}\t{}'.format(*t)
                                 for t in to_triplets(ent)]),
                      file=sys.stdout)
            except (KeyError, ValueError) as e:
                print(e, file=sys.stderr)
class LazyJsonReader(object):
    """Newline-separated json log reader tolerating massive log files"""

    chunk_size = 2048

    def __init__(self, file_path, file_gzipped=False):
        self.gz = file_gzipped
        self.file = GzipFile(file_path, 'rb') if file_gzipped \
            else open(file_path, 'rb')
        # As bytes are read from the file a line count is kept.
        # At any time we know:
        #  * the position of our pointer in the file's contents
        #  * what line number we are on
        # So, it should be possible to fetch the previous/next line with
        # some crafty seeking.
        self.line = 0

    def _get_position(self):
        """Return tuple of (current_line, current_file_position)"""
        return (self.line, self.file.tell())

    def _seek_to(self, line, pos):
        """Seek to arbitrary locations.

        There's no logic here; this method assumes the line number and
        position specified are correct.
        """
        self.line = line
        self.file.seek(pos)

    def decode(self, s):
        return s.decode('UTF-8')

    def read_next(self):
        """Read the next line from the file, parse and return.
        Returns None if out of lines.
        """
        data = self.file.readline().strip()
        if data:
            self.line += 1
        return json.loads(self.decode(data)) if data else None

    def read_prev(self):
        """Read the previous line from the file, parse and return.
        Returns None if out of lines.
        """
        original_pos = current_pos = self.file.tell()

        # can't fall off the beginning
        if current_pos == 0:
            return None

        # Rewind by chunk_size and read chunk_size bytes; repeat until the
        # buffer holds enough newlines to isolate a full previous line,
        # then split and grab it.
        rewound_chunk = b""
        while rewound_chunk.count(b"\n") < 3:  # 3 rather than 2 to tolerate
                                               # partial reads
            before_jump = current_pos
            # Jump backwards, and prevent falling off the start
            current_pos = max(0, current_pos - self.chunk_size)
            self.file.seek(current_pos)
            jumped_by = before_jump - current_pos
            # prepend the chunk to our buffer
            rewound_chunk = b''.join([self.file.read(jumped_by),
                                      rewound_chunk])
            # If we just read from the beginning of the file this loop
            # should break regardless
            if current_pos == 0:
                break

        # We have a chunk containing at least one full line;
        # find the last line in the chunk:
        #  -1  => blank
        #  -2  => last line emitted
        #  -3  => previous line (won't exist if we hit BOF)
        #  -4+ => line before that and/or partial line garbage
        lines_split = rewound_chunk.split(b"\n")
        if len(lines_split) < 3:
            self.line = 0
            self.file.seek(0)
            return json.loads(self.decode(lines_split[0]))

        prev_line = lines_split[-2]

        # Calculate how far backwards we jumped, and seek to the beginning
        # of the line we're returning.
        # TODO: should this live elsewhere so read_next doesn't return
        # this line again?
        after_prev_line = lines_split[-1:]
        rewound_len = len(b"\n".join([prev_line] + after_prev_line))
        self.file.seek(original_pos - rewound_len)
        self.line -= 1
        return json.loads(self.decode(prev_line))
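# Usage sketch (illustrative; 'events.log.gz' is a hypothetical file of
# newline-separated JSON records):
reader = LazyJsonReader('events.log.gz', file_gzipped=True)
first = reader.read_next()
second = reader.read_next()
again_first = reader.read_prev()  # steps back to the previous record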
class TRJ_reader(abstract_trajectory_reader):
    """Read LAMMPS trajectory file

    This is a naive (and comparatively slow) implementation,
    written entirely in python.
    """

    @classmethod
    def reader_available(cls):
        return True

    def __init__(self, filename, x_factor=0.1, t_factor=1.0):
        if filename.endswith('.gz'):
            from gzip import GzipFile
            self._fh = GzipFile(filename, 'r')
        elif filename.endswith('.bz2'):
            from bz2 import BZ2File
            self._fh = BZ2File(filename, 'r')
        else:
            self._fh = open(filename, 'r')
        self._open = True
        self._item_re = \
            re.compile(r'^ITEM: (TIMESTEP|NUMBER OF ATOMS|BOX BOUNDS|ATOMS) ?(.*)$')
        self.x_factor = x_factor
        self.t_factor = t_factor
        self.v_factor = x_factor / t_factor
        self._first_called = False
        self._index = count(1)

    # Example of the frame format being parsed:
    # ITEM: TIMESTEP
    # 81000
    # ITEM: NUMBER OF ATOMS
    # 1536
    # ITEM: BOX BOUNDS pp pp pp
    # 1.54223 26.5378
    # 1.54223 26.5378
    # 1.54223 26.5378
    # ITEM: ATOMS id type x y z vx vy vz
    # 247 1 3.69544 2.56202 3.27701 0.00433856 -0.00099307 -0.00486166
    # 249 2 3.73324 3.05962 4.14359 0.00346029 0.00332502 -0.00731005
    # 463 1 3.5465 4.12841 5.34888 0.000523332 0.00145597 -0.00418675

    def _read_frame_header(self):
        while True:
            L = self._fh.readline()
            m = self._item_re.match(L)
            if not m:
                if L == '':
                    self._fh.close()
                    self._open = False
                    raise StopIteration
                if L.strip() == '':
                    continue
                raise IOError(
                    "TRJ_reader: Failed to read/parse TRJ frame header")
            if m.group(1) == "TIMESTEP":
                step = int(self._fh.readline())
            elif m.group(1) == "NUMBER OF ATOMS":
                natoms = int(self._fh.readline())
            elif m.group(1) == "BOX BOUNDS":
                bbounds = [map(float, self._fh.readline().split())
                           for _ in range(3)]
                x = array(bbounds)
                box = np.diag(x[:, 1] - x[:, 0])
                if x.shape == (3, 3):
                    box[1, 0] = x[0, 2]
                    box[2, 0] = x[1, 2]
                    box[2, 1] = x[2, 2]
                elif x.shape != (3, 2):
                    raise IOError(
                        'TRJ_reader: Malformed box bounds in TRJ frame header')
            elif m.group(1) == "ATOMS":
                cols = tuple(m.group(2).split())
                # At this point, there should be only atomic data left
                return (step, natoms, box, cols)

    def _get_first(self):
        # Read first frame, update state of self, create indexes etc
        step, N, box, cols = self._read_frame_header()
        self._natoms = N
        self._step = step
        self._cols = cols
        self._box = box

        def _all_in_cols(keys):
            for k in keys:
                if not k in cols:
                    return False
            return True

        self._x_map = None
        if _all_in_cols(('id', 'xu', 'yu', 'zu')):
            self._x_I = array(map(cols.index, ('xu', 'yu', 'zu')))
        elif _all_in_cols(('id', 'x', 'y', 'z')):
            self._x_I = array(map(cols.index, ('x', 'y', 'z')))
        elif _all_in_cols(('id', 'xs', 'ys', 'zs')):
            self._x_I = array(map(cols.index, ('xs', 'ys', 'zs')))
            _x_factor = self._box.diagonal().reshape((3, 1))
            # xs.shape == (3,n)
            self._x_map = lambda xs: xs * _x_factor
        else:
            raise RuntimeError('TRJ file must contain at least atom-id, x, y, '
                               'and z coordinates to be useful.')
        self._id_I = cols.index('id')

        if _all_in_cols(('vx', 'vy', 'vz')):
            self._v_I = array(map(cols.index, ('vx', 'vy', 'vz')))
        else:
            self._v_I = None

        if 'type' in cols:
            self._type_I = cols.index('type')
        else:
            self._type_I = None

        data = array([map(float, self._fh.readline().split())
                      for _ in range(N)])
        I = np.asarray(data[:, self._id_I], dtype=np.int)
        # Unless dump is done for group "all", ids need not be consecutive:
        I[np.argsort(I)] = arange(len(I))
        self._x = zeros((3, N), order='F')
        if self._x_map is None:
            self._x[:, I] = data[:, self._x_I].transpose()
        else:
            self._x[:, I] = self._x_map(data[:, self._x_I].transpose())
        if self._v_I is not None:
            self._v = zeros((3, N), order='F')
            self._v[:, I] = data[:, self._v_I].transpose()
        # bug fix: mark the first frame as consumed, otherwise every call
        # to next() would re-read the first frame
        self._first_called = True

    def _get_next(self):
        # get next frame, update state of self
        step, N, box, cols = self._read_frame_header()
        assert (self._natoms == N)
        assert (self._cols == cols)
        self._step = step
        self._box = box

        data = array([map(float, self._fh.readline().split())
                      for _ in range(N)])
        I = np.asarray(data[:, self._id_I], dtype=np.int) - 1
        if self._x_map is None:
            self._x[:, I] = data[:, self._x_I].transpose()
        else:
            self._x[:, I] = self._x_map(data[:, self._x_I].transpose())
        if self._v_I is not None:
            self._v[:, I] = data[:, self._v_I].transpose()

    def __iter__(self):
        return self

    def close(self):
        if not self._fh.closed:
            self._fh.close()

    def next(self):
        if not self._open:
            raise StopIteration
        if self._first_called:
            self._get_next()
        else:
            self._get_first()
        res = dict(
            index=self._index.next(),
            N=int(self._natoms),
            box=self.x_factor * self._box.copy('F'),
            time=self.t_factor * self._step,
            x=self.x_factor * self._x,
        )
        if self._v_I is not None:
            res['v'] = self.v_factor * self._v
        return res
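# Usage sketch (illustrative; 'dump.lammpstrj.gz' is a hypothetical LAMMPS
# dump). The reader is its own iterator, yielding one frame dict per step:
reader = TRJ_reader('dump.lammpstrj.gz')
for frame in reader:
    print(frame['index'], frame['time'], frame['x'].shape)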
class QueueFileReader(object):
    '''reads (dequeues) from single queue file'''

    def __init__(self, qfile, noupdate=False):
        self.fn = qfile
        self.noupdate = noupdate
        self.map = None
        self.open()

    def open(self):
        fd = os.open(self.fn, os.O_RDWR)
        self.pos = 0
        sig = os.read(fd, 2)
        # check for gzip signature
        if sig == '\x1f\x8b':
            # there's no added benefit to mmap gzip file, I guess
            os.lseek(fd, 0, 0)
            self.z = GzipFile(fileobj=os.fdopen(fd), mode='rb')
            self.__next = self.__next_gzip
        else:
            self.z = None
            self.map = mmap.mmap(fd, 0, access=mmap.ACCESS_WRITE)
            # mmap dups fd, fd need not be kept open.
            os.close(fd)
            self.__next = self.__next_mmap

    def close(self):
        if self.z:
            self.z.close()
            self.z = None
        if self.map:
            self.map.close()
            self.map = None

    def __next_mmap(self):
        if self.map is None:
            logging.warn("QueueFileReader:next called on closed file:%s",
                         self.fn)
            raise StopIteration
        while self.pos < self.map.size():
            el = self.map.find('\n', self.pos + 1)
            if el < 0:
                el = self.map.size()
            s = self.pos
            self.pos = el + 1
            if self.map[s] == ' ':
                l = self.map[s + 1:el]
                if not self.noupdate:
                    self.map[s] = '#'
                try:
                    return json.loads(l)
                except Exception as ex:
                    logging.warn('malformed line in %s at %d: %s',
                                 self.fn, s, l)
                    continue
        raise StopIteration

    def __next_gzip(self):
        while 1:
            try:
                l = self.z.readline()
            except Exception as ex:
                # probably CRC error due to truncated file. discard the rest.
                # should we keep the file for later diagnosis? we can get
                # IOError from gzip, as well as zlib.error for lower level
                # problems.
                logging.error('error in %s: %s', self.fn, str(ex))
                raise StopIteration
            if l == '':
                break
            if l[0] != ' ':
                continue
            try:
                return json.loads(l[1:])
            except Exception as ex:
                logging.warn('malformed line in %s: %s', self.fn, l)
                continue
        raise StopIteration

    def next(self):
        return self.__next()
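# Usage sketch (illustrative; 'q000.gz' is a hypothetical queue file).
# Records are drained until the reader raises StopIteration:
reader = QueueFileReader('q000.gz', noupdate=True)
while True:
    try:
        record = reader.next()
    except StopIteration:
        break
    process(record)  # process() is a placeholder for real handling
reader.close()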
# FOR EACH SAMPLE, READ IN MATCHES. CATALOG LOCUS IS [2], SAMPLE LOCUS IS [4],
# COV IS [6]. ADD COVERAGE TO CORRECT INDEX FOR THE KEY == CATALOG LOCUS
for i in range(nsamples):
    sample = samples[i]
    if gzipped == 1:
        matches = stacks + "/" + sample + ".matches.tsv.gz"
        sys.stderr.write("Parsing " + sample + ".matches.tsv.gz...\n")
        logfile.write("Parsing " + sample + ".matches.tsv.gz...\n")
        matches = GzipFile(matches, 'r')
    else:
        matches = stacks + "/" + sample + ".matches.tsv"
        sys.stderr.write("Parsing " + sample + ".matches.tsv...\n")
        logfile.write("Parsing " + sample + ".matches.tsv...\n")
        matches = open(matches, 'r')
    matches.readline()  # skip header line
    nmatches = 0
    for match in matches:
        match = match.strip('\n').split('\t')
        clocus = match[2]
        slocus = match[4]
        coverage = int(match[6])
        if clocus in locusCoverage:
            nmatches += 1
            locusCoverage[clocus][i] += coverage
            if nmatches % 1000 == 0:
                sys.stderr.write("Matches found (incl. alt. alleles):\t%s\r"
                                 % (nmatches))
    matches.close()
    sys.stderr.write("Matches found (incl. alt. alleles):\t%s\n" % (nmatches))
    logfile.write("Matches found (incl. alt. alleles):\t%s\n" % (nmatches))
class Rdata(object):
    def __init__(self, name, mode='rb', buffering=1):
        self._file = None
        self._format = None
        self._compression = NO_COMPRESSION
        # file version info
        self._version = None
        self._rversion = None
        self._min_rversion = None
        self.open(name, mode, buffering)
        self.readHeader()

    def open(self, name, mode, buffering):
        # Open as ordinary binary file and check type;
        # reopen as necessary using required class
        self._file = open(name, mode, buffering)
        magic = self._file.read(3)
        # gzip
        if magic[0] == '\x1f' and magic[1] == '\x8b' and magic[2] == '\x08':
            self._file.close()
            # note: GzipFile's third positional argument is compresslevel,
            # not buffering, so buffering is not forwarded here
            self._file = GzipFile(name, mode)
        # bz2
        elif magic[0] == '\x42' and magic[1] == '\x5a' and magic[2] == '\x68':
            self._compression = BZIP2_COMPRESSION
            self._file.close()
            self._file = BZ2File(name, mode, buffering)
        # xz
        elif magic[0] == '\xfd' and magic[1] == '\x37' and magic[2] == '\x7a':
            raise NotImplementedError("xz compression not supported")
        # no compression, read from file as-is starting from beginning
        else:
            self._file.seek(0)

    # read n bytes from file
    def read(self, n=1):
        return self._file.read(n)

    # Read header for format and file version
    def readHeader(self):
        magic = self._file.readline().strip()
        # check magic number
        # TODO: implement multiple file versions
        if magic in ["RDX2"]:
            self._format = XDR_FILE
        # version 1 ASCII, binary, xdr
        elif magic in ["RDA1", "RDB1", "RDX1"]:
            self._version = 1
            raise NotImplementedError("Version 1 saves not yet supported")
        # version 2 ASCII, binary
        elif magic in ["RDA2", "RDB2"]:
            self._version = 2
            raise NotImplementedError("Version 2 non-XDR saves not yet supported")
        elif magic[:2] == "RD":
            # magic number looks legit but did not match any known types
            raise RuntimeError("Unknown save version found")
        else:
            # this doesn't even look like an R save file
            raise RuntimeError("Unknown file format - are you sure this is "
                               "an R save file?")

        ftype = self._file.readline().strip()
        # assert ftype == 'X'
        self._version = self.getInteger()
        self._rversion = "%d.%d.%d" % self.decodeVersion(self.getInteger())
        self._min_rversion = "%d.%d.%d" % self.decodeVersion(self.getInteger())
        # was "is not 2" (identity comparison) raising the nonexistent
        # RuntimeException
        if self._version != 2:
            raise RuntimeError("Only version 2 saves supported")

    def close(self):
        self._file.close()

    def getLength(self):
        length = self.getInteger()
        # very long arrays might need 64 bits to store length
        if length == -1:  # was "is -1": identity comparison bug
            len1 = long(self.getInteger())
            len2 = long(self.getInteger())
            length = (len1 << 32) + len2
        return length

    def getInteger(self):
        """ Parse the next 4 bytes in the stream as an integer """
        if self._format == XDR_FILE:
            return xdrlib.Unpacker(
                self._file.read(R_XDR_INTEGER_SIZE)).unpack_int()

    # serialize.c contains UnpackFlags(...)
    def getFlags(self):
        """ Parse and return SEXP flags as in Serialize.c:UnpackFlags

        Takes in an integer containing object flags.
        Returns a tuple of 5 items.
        """
        flags = self.getInteger()
        ptype = flags & 255
        plevs = flags >> 12
        pisobj = True if flags & (1 << 8) else False
        phasattr = True if flags & (1 << 9) else False
        phastag = True if flags & (1 << 10) else False
        log.info("%s %s %s %s", SEXP_TYPES[ptype], pisobj, phasattr, phastag)
        return (ptype, plevs, pisobj, phasattr, phastag)

    def getIntegerVec(self):
        """ Read a vector of integers """
        length = self.getLength()
        unpacker = xdrlib.Unpacker(self.read(length * R_XDR_INTEGER_SIZE))
        return unpacker.unpack_farray(length, unpacker.unpack_int)

    def getRealVec(self):
        """ Get a vector of real numbers from the input stream, assuming
        numbers are stored according to the XDR double floating point
        standard """
        length = self.getLength()
        unpacker = xdrlib.Unpacker(self._file.read(length * SIZEOF_DOUBLE))
        return unpacker.unpack_farray(length, unpacker.unpack_double)

    def getComplex(self):  # was getComplex(stream): missing self
        raise NotImplementedError("Complex values not yet implemented")

    def getComplexVec(self, length):  # was getComplexVec(stream, length)
        raise NotImplementedError("Complex values not yet implemented")

    def getChar(self, n=1):
        """ Get n characters from the stream and return them as a string """
        if n < 0:
            return "NA"
        else:
            return self._file.read(n)

    def getString(self):
        length = self.getInteger()
        # strings are length-prefixed
        log.info("length = %d", length)
        string = self.getChar(length)
        log.indent(); log.debug("'%s'", string); log.dedent()
        return string

    def decodeVersion(self, packed):
        """ Decode packed version number into a human readable format """
        v = packed // 65536
        packed %= 65536
        p = packed // 256
        packed %= 256
        s = packed
        return (v, p, s)
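# Usage sketch (illustrative; 'workspace.RData' is a hypothetical XDR-format
# R save file). The constructor opens the file and parses the header:
rdata = Rdata('workspace.RData')
print(rdata._rversion)    # R version that wrote the file
flags = rdata.getFlags()  # flags of the first serialized object
rdata.close()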
    sys.exit(1)

fd = GzipFile(args[0], 'r')
# aux buffer for new modified file
output = ""
# aux dict keyed by src_dst combination, holding the corresponding line of the file
src_dst_dic = {}
# aux boolean dict keyed by src_dst combination:
# 0 = no response from DST yet, 1 = response from DST for the last src_dst message
response = {}
# adjustable step time for messages in the same session
step = float(args[1])

while 1:
    line = fd.readline()
    # if the script reaches the end of the file and line is empty, break the loop
    if line == "":
        break
    line = line.split(' ')
    src_dst = line[2] + "_" + line[3]
    dst_src = line[3] + "_" + line[2]
    line[4] = (' '.join(str(n) for n in line[4:])).replace("\n", "")
    line = line[0:5]
    # check if there's an entry for this combination of SRC and DST
    if src_dst in src_dst_dic:
        if response[src_dst] == 0 and \
                float(line[0]) - float(src_dst_dic[src_dst][0]) < step:
            src_dst_dic[src_dst][4] += line[4]
        else:
            output += ' '.join(src_dst_dic[src_dst]) + "\n"
def readline_checkEnd(self, size=-1):
    line = GzipFile.readline(self, size)
    if self.stop is not None and line[:self.stoplen] == self.stop:
        return ''
    return line
def recover_one(self, folder, password, destination):
    log("recover_one", folder, password, destination)
    save_cwd = os.getcwd()
    os.chdir(folder)
    encrypted = len(password) > 0
    tmp_dir = tempfile.mkdtemp()
    try:
        ############
        #
        # PROCESSING THE TAR FILE
        #
        ############
        if encrypted:
            log("Password required")
            pass_file = os.path.join(tmp_dir, "pwd")
            os.mkfifo(pass_file, 0o600)
            cat = Popen('find data -type f -print | sort | xargs cat',
                        shell=True, stdout=PIPE)
            openssl = Popen("/usr/bin/openssl enc -d -aes256 -md sha256 "
                            "-pass 'file:%s'" % pass_file,
                            shell=True, stdin=cat.stdout, stdout=PIPE)
            tar = Popen("/bin/tar -xzf - --directory '%s'" % (destination,),
                        shell=True, stdin=openssl.stdout,
                        stdout=PIPE, stderr=PIPE)
            # Send the passphrase via a pipe file.
            # It won't appear in the process list (via cmd line arg).
            tmp_fd = open(pass_file, "w")
            tmp_fd.write(password)
            tmp_fd.close()
        else:
            log("Starting tar")
            cat = Popen('find data -type f -print | sort | xargs cat',
                        shell=True, stdout=PIPE)
            tar = Popen("/bin/tar -xzf - --directory '%s'" % destination,
                        shell=True, stdin=cat.stdout,
                        stdout=PIPE, stderr=PIPE)

        # Wait for the tar to finish
        log('Waiting for tar to finish')
        stdout, stderr = tar.communicate()
        log("stderr=", stderr)
        log("stdout=", stdout)
        tar.wait()
        log("Main extraction complete")
        print("Errors:", stderr)

        ############
        #
        # PROCESSING THE LOF FILE
        #
        ############
        if encrypted:
            log("starting lof processing")
            pass_file = os.path.join(tmp_dir, "pwd2")
            os.mkfifo(pass_file, 0o600)
            lof_file = os.path.join(tmp_dir, "lof")
            openssl = Popen("/usr/bin/openssl enc -d -aes256 -md sha256 "
                            "-pass 'file:%s' -in lof.enc -out '%s'"
                            % (pass_file, lof_file),
                            shell=True, stdout=PIPE)
            # Send the passphrase via a pipe file.
            # It won't appear in the process list (via cmd line arg).
            tmp_fd = open(pass_file, "w")
            tmp_fd.write(password)
            tmp_fd.close()
            openssl.wait()
            lof = GzipFile(lof_file, mode="rb")
        else:
            lof = GzipFile("lof", "r")
        try:
            log("Start LOF processing for Deletes")
            while True:
                line = lof.readline()
                if not line:
                    log("Done!")
                    break
                # Remove the \n
                line = line[:-1]
                log("line", line)
                if line == "":
                    # New folder (remember to strip the \n)
                    folder = lof.readline()[:-1].decode("quopri_codec")
                    log("New folder", folder)
                    continue
                parts = line.split(",")
                log("Line parts:", parts)
                name = parts[0].decode("quopri_codec")
                type = parts[1]
                if type != "X":
                    log("Not a delete")
                    continue
                if folder[0] == os.sep:
                    folder = folder[1:]
                path = os.path.join(destination, folder, name)
                log("DELETE ", path)
                if os.path.isdir(path):
                    shutil.rmtree(path)
                else:
                    os.remove(path)
        finally:
            lof.close()
    except Exception as e:
        print("Got exception in recover:", str(e))
    finally:
        # Remove the temp dir
        shutil.rmtree(tmp_dir)
        os.chdir(save_cwd)
class BigTxtFile:
    def __init__(self, fp, inibuff=100, header_preffix=None, split=None):
        '''BigTxtFile(fp[, inibuff=100, header_preffix=None, split=None]) -> BigTxtFile

        fp - a file object (with seek and tell methods) or a filename
        inibuff - initial buffer, in lines, to read for detecting the CR length
        header_preffix - if provided, inibuff is discarded and all lines
            starting with header_preffix are treated as a separate part of
            the file (the header)
        split - if provided, every line after the header is split by the
            provided delimiter'''
        assert type(fp) in [str, file]
        if type(fp) == str:
            # probe for gzip by attempting a compressed readline
            try:
                self.fp = GzipFile(fp)
                self.fp.readline()
                self.fp.seek(0)
            except:
                self.fp = open(fp)
        else:
            self.fp = fp
        self.header = []
        self.inibuff = []
        self.lenCR = 1
        self.split = split
        self.bodypos = 0
        if header_preffix:
            for i in self.fp:
                self.bodypos += len(i)
                if len(i.strip()) != len(i):
                    self.lenCR = len(i) - len(i.strip())
                if i.startswith(header_preffix):
                    self.header.append(i.strip())
                else:
                    self.bodypos -= len(i)
                    break
        else:
            for i in self.fp:
                self.inibuff.append(i)
                inibuff -= 1
                if inibuff < 1:
                    break
            if self.inibuff:
                self.lenCR = len(self.inibuff[0]) - len(self.inibuff[0].strip())
        self._initialise()

    def _initialise(self):
        '''Abstract method in case you want to postprocess headers or do indexing'''
        pass

    def _body(self):
        '''Generator for each line of the body (whole file if header not provided)'''
        self.fp.seek(self.bodypos)
        if self.split:
            for i in self.fp:
                yield i[:-self.lenCR].split(self.split)
        else:
            for i in self.fp:
                yield i[:-self.lenCR]

    def _header(self):
        '''Generator for the header records'''
        for i in self.header:
            yield i

    def __iter__(self):
        '''Generator for the whole file'''
        for i in itertools.chain(self._header(), self._body()):
            yield i
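# Usage sketch (illustrative; 'table.tsv.gz' is a hypothetical tab-separated
# file with '#'-prefixed header lines):
big = BigTxtFile('table.tsv.gz', header_preffix='#', split='\t')
for fields in big._body():
    print(fields[0])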
class QueueFileReader(object):
    '''reads (dequeues) from single queue file'''

    def __init__(self, qfile, noupdate=False):
        self.fn = qfile
        self.noupdate = noupdate
        self.map = None
        self.open()

    def open(self):
        fd = os.open(self.fn, os.O_RDWR)
        self.map = mmap.mmap(fd, 0, access=mmap.ACCESS_WRITE)
        # mmap dups fd; fd need not be kept open.
        os.close(fd)
        self.pos = 0
        if self.map[0:2] == '\x1f\x8b':
            # gzip signature: read through GzipFile layered over the mmap
            self.z = GzipFile(fileobj=self.map, mode='rb')
            self.__next = self.__next_gzip
        else:
            self.z = None
            self.__next = self.__next_mmap

    def close(self):
        if self.z:
            self.z.close()
            self.z = None
        if self.map:
            self.map.close()
            self.map = None

    def __next_mmap(self):
        while self.pos < self.map.size():
            el = self.map.find('\n', self.pos + 1)
            if el < 0:
                el = self.map.size()
            s = self.pos
            self.pos = el + 1
            if self.map[s] == ' ':
                l = self.map[s + 1:el]
                if not self.noupdate:
                    self.map[s] = '#'
                try:
                    return cjson.decode(l)
                except Exception as ex:
                    logging.warn('malformed line in %s at %d: %s',
                                 self.fn, s, l)
                    continue
        raise StopIteration

    def __next_gzip(self):
        while 1:
            try:
                l = self.z.readline()
            except IOError as ex:
                # probably CRC error due to truncated file. discard the rest.
                # should we keep the file for later diagnosis?
                logging.error('error in %s: %s', self.fn, str(ex))
                raise StopIteration
            if l == '':
                break
            if l[0] != ' ':
                continue
            try:
                return cjson.decode(l[1:])
            except Exception as ex:
                logging.warn('malformed line in %s: %s', self.fn, l)
                continue
        raise StopIteration

    def next(self):
        if self.map is None:
            logging.warn("QueueFileReader:next called on closed file:%s",
                         self.fn)
            raise StopIteration
        return self.__next()
if __name__ == '__main__':
    from sys import argv, stderr
    from gzip import GzipFile

    raw_vocab = argv[1]
    out_trans = argv[2]
    out_vocab = argv[3]

    fi = GzipFile(raw_vocab, "r")
    f = GzipFile(out_trans, "w")
    g = GzipFile(out_vocab, "w")

    addnewmapping("<*>", f, g)
    addnewmapping("<PP_UNK>", f, g)

    next_to_write = next_seen
    p = 0
    while True:
        v = fi.readline()
        if not v:
            break
        key, value = v.strip().split("\t")
        nkey = transformation(key)
        nkeyid = wordno(nkey)
        addmapping(key, nkeyid, f)
        if nkeyid >= next_to_write:
            print >>g, "%s\t%d" % (nkey, nkeyid)
            next_to_write = nkeyid + 1
        p += 1
        if (p % 100000) == 0:
            print >>stderr, "added", p, key, nkey, nkeyid

    fi.close()
    f.close()
    g.close()
print 'You have PhosphoSitePlus datasets; they will be added to the database.'
print 'Finished fetching public data.'

#### UNIPROT
if 0:
    cur.execute("drop table if exists uniprot")
    cur.execute("create table uniprot (u1 varchar(32) primary key, "
                "entry varchar(16), longname varchar(128), aaseq text, aalen int)")
    fi = GzipFile('data/uniprot/uniprot_sprot.fasta.gz')
    idline = None
    seq = ''
    while 1:
        l = fi.readline()
        if not l:
            # end of file: flush the final record
            i = idline.strip().split('|')
            u1 = i[1]
            entry, longname = i[2].split(' ', 1)
            if entry.endswith('_HUMAN'):
                longname = longname.split(' OS=')[0].replace("'", '')
                cur.execute("insert into uniprot (u1, entry, longname, aaseq, aalen) "
                            "values ('%s', '%s', '%s', '%s', %d)"
                            % (u1, entry, longname, seq, len(seq)))
            break
        if l.startswith('>'):
            if not idline:
                idline = l
                seq = ''
            i = idline.strip().split('|')
def readline_NOcheckEnd(self, size=-1):
    line = GzipFile.readline(self, size)
    return unicode(line, 'latin_1').encode('utf_8')
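# Sketch (an assumption, not recovered source): the two readline_* methods
# above read like members of a GzipFile subclass that stops at a sentinel
# line; a minimal enclosing class might look like this.
from gzip import GzipFile

class SentinelGzipFile(GzipFile):
    def __init__(self, filename, stop=None):
        GzipFile.__init__(self, filename)
        self.stop = stop                          # sentinel line prefix, or None
        self.stoplen = len(stop) if stop else 0
    # readline_checkEnd / readline_NOcheckEnd from above would be
    # defined here as methods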
import sys
import os
import codecs
from datetime import datetime, timedelta
from gzip import GzipFile  # added: GzipFile is used below but was not imported

output = codecs.open('kv7kalender.idx', 'w', 'UTF-8')
lastupdated_filenames = {}
kalender_filenames = [sys.argv[1] + '/' + x for x in os.listdir(sys.argv[1])]
kalender_threshold = (datetime.now() - timedelta(days=3)).isoformat()

for filename in sorted(kalender_filenames):
    f = GzipFile(filename, 'r')
    try:
        firstline = f.readline()[:-1]
        values = firstline.split('|')
        subscription = values[2]
        creationdate = values[7]
        if creationdate < kalender_threshold:
            continue
        if subscription not in lastupdated_filenames:
            lastupdated_filenames[subscription] = {'filename': filename,
                                                   'creationdate': creationdate}
        elif creationdate > lastupdated_filenames[subscription]['creationdate']:
            lastupdated_filenames[subscription] = {'filename': filename,
                                                   'creationdate': creationdate}
    finally:
        f.close()

for key, values in lastupdated_filenames.items():
    print key + ' - ' + values['filename']
    output.write(values['filename'] + '\n')