import io
import json
import logging
import os
from bz2 import BZ2File
from gzip import GzipFile


def articles(wiki_json_fn, limit=None):
    count = 0

    _, ext = os.path.splitext(wiki_json_fn)

    if ext == '.gz':
        f = GzipFile(wiki_json_fn, mode='r')
    elif ext == '.bz2':
        f = BZ2File(wiki_json_fn, mode='r')
    else:
        f = io.open(wiki_json_fn, mode='rb')

    while True:
        line = f.readline()

        if line == b'':
            break

        action = json.loads(line.decode('utf-8'))

        line = f.readline()

        if line == b'':
            break

        source = json.loads(line.decode('utf-8'))

        if is_page(action, source):
            yield {'id': action['index']['_id'], 'title': source['title'], 'text': source['text']}

            count += 1

        if limit and count > limit:
            return

        if count % 10000 == 0:
            logging.info("read %d articles" % count)

    f.close()
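
The generator above assumes an Elasticsearch/CirrusSearch-style dump in which an action line and a source line alternate, plus an is_page() predicate defined elsewhere. A minimal hypothetical driver (the dump path and the is_page definition below are assumptions, not part of the original):

def is_page(action, source):
    # Hypothetical predicate: keep entries that look like content pages.
    return 'index' in action and 'title' in source and 'text' in source

for article in articles('enwiki-cirrus.json.gz', limit=1000):
    print(article['id'], article['title'])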
Example #2
    def _parseSingleFile(self, logname):

        # get the log
        try:
            log, headers = urllib.urlretrieve(logname)
        except IOError:
            log = logname

        try:
            fp = GzipFile(log)
            fp.readline()
            fp.seek(0)
        except IOError:
            fp = open(log, "rb")

        # parse the log
        parser = self.parsers.get(
            self.harnessType, TinderboxParser)(includePass=self.includePass)
        lineno, results = parser.parse(fp)
        fp.close()

        # dump output
        results.update({'filename': os.path.basename(logname)})

        # create a sha1 hash to be this json's id
        m = hashlib.sha1()
        m.update(json.dumps(results))
        id = ""
        if 'starttime' in results:
            id += str(results['starttime']) + '-'
        id += m.hexdigest()
        results.update({'id': id})

        return results
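
The id scheme above (an optional start time plus a SHA-1 of the serialized results) can be illustrated in isolation; the dictionary below is a made-up example:

import hashlib
import json

results = {'starttime': 1234567890, 'passed': 10, 'failed': 0}
digest = hashlib.sha1(json.dumps(results).encode('utf-8')).hexdigest()
result_id = '%s-%s' % (results['starttime'], digest)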
Example #3
class Reader(object):
    """Reader class"""
    def __init__(self, wikidata_dump_fn, decoding="utf-8"):
        """
        
        :param wikidata_dump_fn: wikidata dump filename
        :param decoding: encoding used
        """
        self.wikidata_dump_fn = wikidata_dump_fn
        self.decoding = decoding
        self.dump = GzipFile(wikidata_dump_fn, 'r')
        self.line = self.dump.readline()

    def has_next(self):
        """
        Check if there are still entries to be read
        :return: True if more entries are available
        """
        self.line = self.dump.readline().decode(self.decoding)
        return self.line != ''

    def next(self):
        """
        Return the next entry
        :return: next entry
        """
        try:
            return json.loads(self.line.strip('\n,'))
        except json.decoder.JSONDecodeError as e:
            return None
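
A hypothetical read loop for the Reader class above (the dump filename is an assumption); note that has_next() both tests for and consumes the next line, so next() should be called once per has_next():

reader = Reader('wikidata-20200101-all.json.gz')
while reader.has_next():
    entity = reader.next()
    if entity is not None:
        print(entity.get('id'))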
Example #4
def parse_file(log):
    print log
    try:
        fp = GzipFile(log)
        fp.readline()
        fp.seek(0)
    except IOError, e:
        fp = open(log, "rb")
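Several examples on this page use the same probe: GzipFile() succeeds on any file, and only the first read raises IOError for non-gzip input, hence the readline()/seek(0) pair. A generic sketch of that pattern (the helper name is an assumption):

from gzip import GzipFile

def open_maybe_gzip(path):
    # Try gzip first; a probe readline() forces header validation.
    try:
        fp = GzipFile(path)
        fp.readline()
        fp.seek(0)
        return fp
    except IOError:
        return open(path, "rb")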
Example #6
def _cat(data, dups, fin, N, n, grep, encoding=None, filter_out=False):
    # Read enough bytes for the longest BOM; the UTF-32 BOMs are 4 bytes and
    # share a prefix with the UTF-16 ones, so they must be checked first.
    bom = fin.read(4)
    need_newline = True
    if bom.startswith(codecs.BOM_UTF32_LE):
        fin = codecs.EncodedFile(fin, 'utf-8', 'utf-32-le')
    elif bom.startswith(codecs.BOM_UTF32_BE):
        fin = codecs.EncodedFile(fin, 'utf-8', 'utf-32-be')
    elif bom.startswith(codecs.BOM_UTF16_LE):
        fin = codecs.EncodedFile(fin, 'utf-8', 'utf-16-le')
    elif bom.startswith(codecs.BOM_UTF16_BE):
        fin = codecs.EncodedFile(fin, 'utf-8', 'utf-16-be')
    elif bom.startswith('\x1f\x8b'):
        if N:
            raise ValueError('Tail is not supported for GZip files')
        fin.seek(0)
        fin = GzipFile(mode='r', fileobj=fin)
    elif encoding is not None:
        fin = codecs.EncodedFile(fin, 'utf-8', encoding)
        fin.seek(0)
        need_newline = False
    else:
        need_newline = False
        fin.seek(0)

    if need_newline:
        fin.readline()

    if N:
        data += tail(fin, N, grep, filter_out)
    elif grep or n:
        for line in fin:
            line = line.rstrip('\n')
            matches = None
            if grep:
                matches = grep.search(line)
            if not grep or (not filter_out and matches) or \
                (filter_out and not matches):
                if matches:
                    groups = matches.groups()
                    if groups:
                        record = '\t'.join(groups)
                        if record not in dups:
                            data.append(record)
                            dups.add(record)
                    else:
                        data.append(line)
                else:
                    data.append(line)

            if n and len(data) >= n:
                break
    else:
        block_size = 4 * 8192
        block = fin.read(block_size)
        if len(block) == block_size:
            block += "\n[FILE TRUNCATED, USE DOWNLOAD]"

        data.append(block)
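
The BOM dispatch at the top of _cat() can be factored into a standalone sniffer; this sketch (the function name is an assumption) repeats the key ordering point, checking the 4-byte UTF-32 BOMs before the UTF-16 BOMs they prefix:

import codecs

def sniff_bom(path):
    with open(path, 'rb') as f:
        head = f.read(4)
    for bom, name in ((codecs.BOM_UTF32_LE, 'utf-32-le'),
                      (codecs.BOM_UTF32_BE, 'utf-32-be'),
                      (codecs.BOM_UTF16_LE, 'utf-16-le'),
                      (codecs.BOM_UTF16_BE, 'utf-16-be')):
        if head.startswith(bom):
            return name
    return None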
Example #7
File: utils.py Project: StyXman/psync
def gunzip(gzFileName, fileName):
    inFile = GzipFile(gzFileName)
    outFile = open(fileName, "w+")

    line = inFile.readline()
    while line:
        outFile.write(line)
        line = inFile.readline()

    inFile.close()
    outFile.close()
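
The same decompression can be written without an explicit readline() loop; a sketch using shutil from the standard library (the function name is an assumption):

import shutil
from gzip import GzipFile

def gunzip_blockwise(gzFileName, fileName):
    # Block copy instead of line-by-line reads; functionally equivalent.
    with GzipFile(gzFileName) as inFile, open(fileName, "wb") as outFile:
        shutil.copyfileobj(inFile, outFile)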
Example #8
    def test_group_events_together(self) -> None:
        buffer = MatchedEventsBuffer()
        buffer.add_event(
            EventMatch('id', 'version', 'log', 'dedup', {'key1': 'value1'}))
        buffer.add_event(
            EventMatch('id', 'version', 'log', 'dedup', {'key2': 'value2'}))

        self.assertEqual(len(buffer.data), 1)

        DDB_MOCK.update_item.return_value = {
            'Attributes': {
                'alertCount': {
                    'N': '1'
                }
            }
        }
        buffer.flush()

        DDB_MOCK.update_item.assert_called_once()
        S3_MOCK.put_object.assert_called_once()
        SNS_MOCK.publish.assert_called_once()

        _, call_args = S3_MOCK.put_object.call_args
        data = GzipFile(None, 'rb', fileobj=call_args['Body'])

        # Verify first event
        event1 = json.loads(data.readline().decode('utf-8'))
        self.assertIsNotNone(
            datetime.strptime(event1['p_alert_creation_time'],
                              '%Y-%m-%d %H:%M:%S.%f000'))
        self.assertIsNotNone(
            datetime.strptime(event1['p_alert_update_time'],
                              '%Y-%m-%d %H:%M:%S.%f000'))
        self.assertEqual(event1['p_rule_id'], 'id')
        self.assertEqual(event1['p_alert_id'], 'id-1')
        self.assertEqual(event1['key1'], 'value1')

        # Verify second event
        event2 = json.loads(data.readline().decode('utf-8'))
        self.assertIsNotNone(
            datetime.strptime(event2['p_alert_creation_time'],
                              '%Y-%m-%d %H:%M:%S.%f000'))
        self.assertIsNotNone(
            datetime.strptime(event2['p_alert_update_time'],
                              '%Y-%m-%d %H:%M:%S.%f000'))
        self.assertEqual(event2['p_rule_id'], 'id')
        self.assertEqual(event2['p_alert_id'], 'id-1')
        self.assertEqual(event2['key2'], 'value2')

        # Assert that the buffer has been cleared
        self.assertEqual(len(buffer.data), 0)
        self.assertEqual(buffer.bytes_in_memory, 0)
Example #9
File: input.py Project: benley/kye
    def __init__(self, playfile, playback):
        instream = GzipFile(playback)
        header = instream.readline().rstrip().decode()
        if not (header.startswith("Kye ") and header.endswith(" recording:")):
            raise KDemoFormatError()

        # Check filename in the demo is what we have loaded.
        fn = instream.readline().rstrip().decode()
        if fn != os.path.basename(playfile):
            raise KDemoFileMismatch(fn)

        # Okay
        self.__level = instream.readline().rstrip().decode()
        self.__rng = pickle.load(instream)
        self.__s: GzipFile = instream
Example #10
 def build_index_gzip(self):
     """creates sorted index from gzip-compressed queue.
     caches object regardless of cacheobj flag.
     """
     self.index = []
     zf = GzipFile(fileobj=self.map, mode="rb")
     while 1:
         p = zf.tell()  # just for diagnosis use
         try:
             l = zf.readline()
         except IOError as ex:
             # probably CRC error due to truncated file. discard the rest.
             logging.error("error in %s at %d: %s", self.fn, p, str(ex))
             break
         if not l:
             break
         if l[0] != " ":
             continue
         try:
             o = cjson.decode(l[1:])
         except Exception as ex:
             logging.warn("skipping malformed JSON at %s:%d: %s", self.fn, p, l[1:])
             continue
         key = o.get("id")
         if key is None:
             try:
                 key = self.urikey(o)
             except UnicodeEncodeError:
                 pass
             if key is None:
                 logging.error("urikey->None for %s", str(o))
                 continue
         self.index.append((key, o))
     zf.close()
Example #11
def getNewMrtgData(madeAdmUtilSnmpd):
    try:
        dbgOut = u" bootstrap: there was an error on mrtg data"
        dataFile = GzipFile("lib/python/org/ict_ok/admin_utils/snmpd/snmp_mrtg_data.gz", "rb")
        if dataFile.readline() == "## mrtg data file for ict_ok.org\n":
            timeStamp = float(dataFile.readline())
            all_templ_data = pickle.load(dataFile)
            dataFile.close()
            madeAdmUtilSnmpd.mrtg_data = copy.deepcopy(all_templ_data)
            madeAdmUtilSnmpd.mrtg_data_timestamp = timeStamp
            dbgOut = u" bootstrap: new mrtg data (%s) loaded" % \
                   (time.strftime("%Y-%m-%d %H:%M:%S +00",time.gmtime(timeStamp)))
    except ValueError:
        dbgOut = u" bootstrap: Hmm, format of mrtg data file incorrect"
    except IOError:
        dbgOut = u" bootstrap: Hmm, no mrtg data file"
    return dbgOut
Example #13
    def _loadFeatureTable(self):

        filename = self.dirname + FEATURE_FILENAME

        f = GzipFile(filename)

        # discard first line
        f.readline()

        while True:
            line = f.readline().rstrip()
            if line == '': break

            (feat, n) = self._parseHashTableLine(line)

            self.emb.featTable[feat] = n
            self.emb.rFeatTable[n] = feat

        f.close()
Example #14
    def _loadFeatureTable(self):
        
        filename = self.dirname + FEATURE_FILENAME
        if not os.path.exists(filename):
            return

        f  = GzipFile(filename)
        
        # discard first line
        f.readline()

        while True:
            line = f.readline().rstrip()
            if line == '': break
            
            (feat, n) = self._parseHashTableLine(line)
            
            self.emb.featTable[feat] = n
            self.emb.rFeatTable[n] = feat
            
        f.close()
Example #15
def is_athena_project(filename):
    """tests whether file is a valid Athena Project file"""
    result = False
    if os.path.exists(filename):
        fh = None
        try:
            fh = GzipFile(filename)
            line1 = bytes2str(fh.readline())
            result = "Athena project file -- Demeter version" in line1
        except:
            pass
        finally:
            # guard against GzipFile() itself failing, which would leave
            # fh unbound and raise NameError here
            if fh is not None:
                fh.close()
    return result
Example #17
class PathListRead_gzip(object):
    def __init__(self, f):
        self.F = f
        self.G = GzipFile(fileobj=f, mode="rb")

    def read(self):
        line = self.G.readline()
        if not line: return None  # EOF
        return line.strip()

    def paths(self):
        while True:
            path = self.read()
            if not path: break
            yield path
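
A hypothetical use of the class above, streaming newline-separated paths out of a gzip member (the filename is an assumption; under Python 3 the yielded paths are bytes):

with open('paths.gz', 'rb') as raw:
    for path in PathListRead_gzip(raw).paths():
        print(path)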
Example #18
 def getFileObjects(self):
     inputs = []
     for f in self.files:
         if f:
             fileid, fd, offset,size = f    
             fd.seek(offset,0)
             content = BytesIO(fd.read(size))
             input = GzipFile(fileid,'rb',fileobj=content)
             try:
                 firstline = input.readline().decode("utf-8","ignore")
                 if self.subformat == "ssa":
                     input = self.convertFromSsa(input)
                 elif self.subformat != "srt":
                     input = self.convertFromSub(input)
                 elif re.match("\{\d+\}\{\d+\}", firstline):
                     input = self.convertFromSub(input)
                 input.seek(0)
             except:
                 sys.stderr.write("Conversion problem: %s\n"%sys.exc_info()[1])
                 continue
             
             # special case for Georgian (Python does not include the georgian-ps encoding)
             if self.langcode=="ka":
                 inputtext = input.read()
                 import chardet
                 if "utf-8" not in chardet.detect(inputtext)["encoding"].lower():
                     sys.stderr.write("Converting Georgian subtitle to UTF-8\n")
                     p = subprocess.Popen("iconv -f georgian-ps -t utf-8", shell=True,
                                     stdin=subprocess.PIPE,stdout=subprocess.PIPE)
                     out, err = p.communicate(inputtext)
                     input = BytesIO(out)
                 else:
                     input = BytesIO(inputtext)
                     
             inputs.append(input) 
     return inputs
Example #19
class ezfio_obj(object):

  def __init__(self,read_only=False):
    self._filename = "EZFIO_File"
    self.buffer_rank = -1
    self.read_only = read_only
    self.locks = {}
  
  def acquire_lock(self,var):
    locks = self.locks
    try:
      locks[var].acquire()
    except:
      locks[var] = threading.Lock()
      locks[var].acquire()

  def release_lock(self,var):
    self.locks[var].release()

  def set_read_only(self,v):
    self.read_only = v

  def get_read_only(self):
    return self.read_only

  def exists(self,path):
    if os.access(path+'/.version',os.F_OK) == 1:
      file = open(path+'/.version',"r")
      v = file.readline().strip()
      file.close()
      return True
    else:
      return False

  def mkdir(self,path):
    if self.read_only:
      self.error('Read-only file.')
    if self.exists(path):
      self.error('mkdir','Group '+path+' exists')
    try:
      os.mkdir(path.strip())
    except OSError:
      pass
    file = open(path.strip()+'/.version','w')
    print >>file,self.version
    file.close()

  def error(self,where,txt):
    print '------------------------------------------------------------'
    print 'EZFIO File     : '+self.filename
    print 'EZFIO Error in : '+where.strip()
    print '------------------------------------------------------------'
    print ''
    print txt.strip()
    print ''
    print '------------------------------------------------------------'
    raise IOError

  def get_filename(self):
    if not self.exists(self._filename):
      self.mkdir(self._filename)
    return self._filename

  def set_filename(self,filename):
    self._filename = filename

  filename = property(fset=set_filename,fget=get_filename)

  def set_file(self,filename):
    self.filename = filename
    if not self.exists(filename):
      self.mkdir(filename)
      self.mkdir(filename+"/ezfio")
      os.system("""
LANG= date > %s/ezfio/creation
echo $USER > %s/ezfio/user
echo %s > %s/ezfio/library"""%(filename,filename,self.LIBRARY,filename))

  def open_write_buffer(self,dir,fil,rank):
    if self.read_only:
      self.error('Read-only file.')
    l_filename=dir.strip()+'/'+fil+'.gz'
    if self.buffer_rank != -1:
      self.error('open_write_buffer','Another buffered file is already open.')

    self.buffer_rank = rank
    assert (self.buffer_rank > 0)

    try:
      self.file = GzipFile(filename=l_filename,mode='wb7')
    except IOError:
      self.error('open_write_buffer','Unable to open buffered file.')

    self.file.write("%2d\n"%(rank,))


  def open_read_buffer(self,dir,fil,rank):
    l_filename=dir.strip()+'/'+fil+'.gz'

    if self.buffer_rank != -1:
      self.error('open_read_buffer','Another buffered file is already open.')

    try:
      self.file = GzipFile(filename=l_filename,mode='rb')
    except IOError:
      self.error('open_read_buffer','Unable to open buffered file.')

    try:
      rank = eval(self.file.readline())
    except IOError:
      self.error('open_read_buffer','Unable to read buffered file.')

    self.buffer_rank = rank
    assert (self.buffer_rank > 0)
    return rank

  def close_buffer(self):
    assert (self.buffer_rank > 0)
    self.buffer_rank = -1
    self.file.close()

  def read_buffer(self,isize):

    if self.buffer_rank == -1:
      self.error('read_buffer','No buffered file is open.')

    indices = []
    values = []
    for i in xrange(isize):
      try:
        line = self.file.readline().split()
      except:
        return indices, values
      if len(line) == 0:
        return indices, values
      indices.append ( [ int(i) for i in line[:-1] ] )
      values.append (eval(line[-1]))
    return indices, values

  def write_buffer(self,indices,values,isize):
    if self.read_only:
      self.error('Read-only file.')
    if self.buffer_rank == -1:
      self.error('write_buffer','No buffered file is open.')

    for i in xrange(isize):
     for j in indices[i]:
       self.file.write("%4d "%(j,))
     self.file.write("%24.15e\n"%(values[i],))
Example #20
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Log-Bilinear model for relation extraction.')
    _arg = parser.add_argument
    _arg('--read-dump',
         type=str,
         action='store',
         metavar='PATH',
         help='Reads in a wikidata json dump.')
    args = parser.parse_args()

    train_set = None
    if args.read_dump:
        dump_in = GzipFile(args.read_dump, 'r')
        line = dump_in.readline()
        iter = 0
    while line:
            iter += 1
            line = dump_in.readline()
            try:
                ent = json.loads(line.rstrip('\n,'))
                if not ent['id'].startswith('Q'):
                    print("Skipping item with id {}".format(ent['id']),
                          file=sys.stderr)
                    continue
                print('\n'.join(
                    ['{}\t{}\t{}'.format(*t) for t in to_triplets(ent)]),
                      file=sys.stdout)
            except (KeyError, ValueError) as e:
                print(e, file=sys.stderr)
Example #21
File: lazyjson.py Project: dpedu2/sless
class LazyJsonReader(object):

    chunk_size = 2048

    """Newline-separated json log reader tolerating massive log files"""
    def __init__(self, file_path, file_gzipped=False):
        self.gz = file_gzipped
        self.file = GzipFile(file_path, 'rb') if file_gzipped else open(file_path, 'rb')
        """
        As bytes are read from the file a line count is kept. At any time we know:
        * the position of our pointer in the file's contents
        * what line number we are on
        So, it should be possible to fetch the previous/next line with some crafty seeking
        """
        self.line = 0

    def _get_position(self):
        """
        Return set of (current_line, current_file_position)
        """
        return (self.line, self.file.tell())

    def _seek_to(self, line, pos):
        """
        Seek to arbitrary locations. There's no logic here, this method assumes the line number and position specified
        are correct.
        """
        self.line = line
        self.file.seek(pos)

    def decode(self, s):
        return s.decode('UTF-8')

    def read_next(self):
        """
        Read the next line from the file, parse and return. Returns None if out of lines.
        """
        data = self.file.readline().strip()
        if data:
            self.line += 1
        return json.loads(self.decode(data)) if data else None

    def read_prev(self):
        """
        Read the previous line from the file, parse and return. Returns None if out of lines.
        """
        original_pos = current_pos = self.file.tell()

        # can't fall off the beginning
        if current_pos == 0:
            return None

        # rewind by chunk_size and read chunk_size bytes
        # repeat until we've found TWO \n - the end of the previous line, and the beginning of the line before the line we want
        # then split n grab
        #print(current_pos)
        rewound_chunk = b""
        while rewound_chunk.count(b"\n") < 3: # changed from 2 to 3 to fix partial reads
            before_jump = current_pos

            # Jump backwards x bytes, and prevent falling off the start
            current_pos = max(0, current_pos-self.chunk_size)
            self.file.seek(current_pos)
            jumped_by = before_jump-current_pos

            # prepend the chunk to our buffer
            rewound_chunk = b''.join([self.file.read(jumped_by), rewound_chunk])
            #rewound_chunk = ''.join([rewound_chunk, '|||||', self.file.read(jumped_by)])
            #print("Read ", jumped_by)

            # If we just read from the beginning of the file this loop should break regardless
            if current_pos == 0:
                break

        # we have a chunk containing at least one full line
        # find the last line in the chunk
        lines_split = rewound_chunk.split(b"\n")

        # -1 => blank
        # -2 => last line emitted
        # -3 => previous line. wont exist if we hit BOF
        # -4+ => line before that and/or partial line garbage
        if len(lines_split) < 3:
            self.line = 0
            self.file.seek(0)
            return json.loads(self.decode(lines_split[0]))
        prev_line = lines_split[-2]

        # Calculate how far backwards we jumped, seek to the beginning of the line we're returning
        # TODO should it be elsewhere so if next_line is called we dont get this line again?
        after_prev_line = lines_split[-1:]
        rewound_len = len(b"\n".join([prev_line] + after_prev_line))
        self.file.seek(original_pos - rewound_len)
        self.line -= 1

        return json.loads(self.decode(prev_line))
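
A hypothetical walk over a newline-separated JSON log using the reader above (the filename is an assumption): read_next() advances one record at a time, while read_prev() rewinds in chunk_size blocks until it can split out the previous record:

r = LazyJsonReader('events.log')
first = r.read_next()
second = r.read_next()
previous = r.read_prev()   # should step back one record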
Example #22
class TRJ_reader(abstract_trajectory_reader):
    """Read LAMMPS trajectory file

    This is a naive (and comparatively slow) implementation,
    written entirely in python.
    """
    @classmethod
    def reader_available(cls):
        return True

    def __init__(self, filename, x_factor=0.1, t_factor=1.0):
        if filename.endswith('.gz'):
            from gzip import GzipFile
            self._fh = GzipFile(filename, 'r')
        elif filename.endswith('.bz2'):
            from bz2 import BZ2File
            self._fh = BZ2File(filename, 'r')
        else:
            self._fh = open(filename, 'r')

        self._open = True
        self._item_re = \
            re.compile(r'^ITEM: (TIMESTEP|NUMBER OF ATOMS|BOX BOUNDS|ATOMS) ?(.*)$')
        self.x_factor = x_factor
        self.t_factor = t_factor
        self.v_factor = x_factor / t_factor
        self._first_called = False
        self._index = count(1)

    # ITEM: TIMESTEP
    # 81000
    # ITEM: NUMBER OF ATOMS
    # 1536
    # ITEM: BOX BOUNDS pp pp pp
    # 1.54223 26.5378
    # 1.54223 26.5378
    # 1.54223 26.5378
    # ITEM: ATOMS id type x y z vx vy vz
    # 247 1 3.69544 2.56202 3.27701 0.00433856 -0.00099307 -0.00486166
    # 249 2 3.73324 3.05962 4.14359 0.00346029 0.00332502 -0.00731005
    # 463 1 3.5465 4.12841 5.34888 0.000523332 0.00145597 -0.00418675

    def _read_frame_header(self):
        while True:
            L = self._fh.readline()
            m = self._item_re.match(L)
            if not m:
                if L == '':
                    self._fh.close()
                    self._open = False
                    raise StopIteration
                if L.strip() == '':
                    continue
                raise IOError(
                    "TRJ_reader: Failed to read/parse TRJ frame header")
            if m.group(1) == "TIMESTEP":
                step = int(self._fh.readline())
            elif m.group(1) == "NUMBER OF ATOMS":
                natoms = int(self._fh.readline())
            elif m.group(1) == "BOX BOUNDS":
                bbounds = [
                    map(float,
                        self._fh.readline().split()) for _ in range(3)
                ]
                x = array(bbounds)
                box = np.diag(x[:, 1] - x[:, 0])
                if x.shape == (3, 3):
                    box[1, 0] = x[0, 2]
                    box[2, 0] = x[1, 2]
                    box[2, 1] = x[2, 2]
                elif x.shape != (3, 2):
                    raise IOError(
                        'TRJ_reader: Malformed box bounds in TRJ frame header')
            elif m.group(1) == "ATOMS":
                cols = tuple(m.group(2).split())
                # At this point, there should be only atomic data left
                return (step, natoms, box, cols)

    def _get_first(self):
        # Read first frame, update state of self, create indexes etc
        step, N, box, cols = self._read_frame_header()
        self._natoms = N
        self._step = step
        self._cols = cols
        self._box = box

        def _all_in_cols(keys):
            for k in keys:
                if not k in cols:
                    return False
            return True

        self._x_map = None
        if _all_in_cols(('id', 'xu', 'yu', 'zu')):
            self._x_I = array(map(cols.index, ('xu', 'yu', 'zu')))
        elif _all_in_cols(('id', 'x', 'y', 'z')):
            self._x_I = array(map(cols.index, ('x', 'y', 'z')))
        elif _all_in_cols(('id', 'xs', 'ys', 'zs')):
            self._x_I = array(map(cols.index, ('xs', 'ys', 'zs')))
            _x_factor = self._box.diagonal().reshape((3, 1))
            # xs.shape == (3,n)
            self._x_map = lambda xs: xs * _x_factor
        else:
            raise RuntimeError('TRJ file must contain at least atom-id, x, y, '
                               'and z coordinates to be useful.')
        self._id_I = cols.index('id')

        if _all_in_cols(('vx', 'vy', 'vz')):
            self._v_I = array(map(cols.index, ('vx', 'vy', 'vz')))
        else:
            self._v_I = None

        if 'type' in cols:
            self._type_I = cols.index('type')
        else:
            self._type_I = None

        data = array(
            [map(float,
                 self._fh.readline().split()) for _ in range(N)])
        I = np.asarray(data[:, self._id_I], dtype=np.int)
        # Unless dump is done for group "all" ...
        I[np.argsort(I)] = arange(len(I))
        self._x = zeros((3, N), order='F')
        if self._x_map is None:
            self._x[:, I] = data[:, self._x_I].transpose()
        else:
            self._x[:, I] = self._x_map(data[:, self._x_I].transpose())
        if self._v_I is not None:
            self._v = zeros((3, N), order='F')
            self._v[:, I] = data[:, self._v_I].transpose()

    def _get_next(self):
        # get next frame, update state of self
        step, N, box, cols = self._read_frame_header()
        assert (self._natoms == N)
        assert (self._cols == cols)
        self._step = step
        self._box = box

        data = array(
            [map(float,
                 self._fh.readline().split()) for _ in range(N)])
        I = np.asarray(data[:, self._id_I], dtype=np.int) - 1
        if self._x_map is None:
            self._x[:, I] = data[:, self._x_I].transpose()
        else:
            self._x[:, I] = self._x_map(data[:, self._x_I].transpose())
        if self._v_I is not None:
            self._v[:, I] = data[:, self._v_I].transpose()

    def __iter__(self):
        return self

    def close(self):
        if not self._fh.closed:
            self._fh.close()

    def next(self):
        if not self._open:
            raise StopIteration

        if self._first_called:
            self._get_next()
        else:
            self._get_first()

        res = dict(
            index=self._index.next(),
            N=int(self._natoms),
            box=self.x_factor * self._box.copy('F'),
            time=self.t_factor * self._step,
            x=self.x_factor * self._x,
        )

        if self._v_I is not None:
            res['v'] = self.v_factor * self._v

        return res
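
This class targets Python 2 (it defines next() rather than __next__ and relies on map() returning lists). A hypothetical Python 2 iteration over a LAMMPS dump, with the filename as an assumption:

reader = TRJ_reader('dump.lammpstrj.gz')
for frame in reader:
    print frame['time'], frame['N'], frame['x'].shape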
Example #23
class QueueFileReader(object):
    '''reads (dequeues) from single queue file'''
    def __init__(self, qfile, noupdate=False):
        self.fn = qfile
        self.noupdate = noupdate
        self.map = None
        self.open()
    def open(self):
        fd = os.open(self.fn, os.O_RDWR)
        self.pos = 0
        sig = os.read(fd, 2)
        # check for gzip signature
        if sig == '\x1f\x8b':
            # there's no added benefit to mmap gzip file, I guess
            os.lseek(fd, 0, 0)
            self.z = GzipFile(fileobj=os.fdopen(fd), mode='rb')
            self.__next = self.__next_gzip
        else:
            self.z = None
            self.map = mmap.mmap(fd, 0, access=mmap.ACCESS_WRITE)
            # mmap dups fd, fd need not be kept open.
            os.close(fd)
            self.__next = self.__next_mmap
    def close(self):
        if self.z:
            self.z.close()
            self.z = None
        if self.map:
            self.map.close()
            self.map = None
    def __next_mmap(self):
        if self.map is None:
            logging.warn("QueueFileReader:next called on closed file:%s",
                         self.fn)
            raise StopIteration
        while self.pos < self.map.size():
            el = self.map.find('\n', self.pos + 1)
            if el < 0: el = self.map.size()
            s = self.pos
            self.pos = el + 1
            if self.map[s] == ' ':
                l = self.map[s + 1:el]
                if not self.noupdate:
                    self.map[s] = '#'
                try:
                    return json.loads(l)
                except Exception as ex:
                    logging.warn('malformed line in %s at %d: %s', self.fn,
                                 s, l)
                    continue
        raise StopIteration

    def __next_gzip(self):
        while 1:
            try:
                l = self.z.readline()
            except Exception as ex:
                # probably CRC error due to truncated file. discard the rest.
                # should we keep the file for later diagnosis? we can get
                # IOError from gzip, as well as zlib.error for lower level
                # problems.
                logging.error('error in %s: %s', self.fn, str(ex))
                raise StopIteration
            if l == '': break
            if l[0] != ' ': continue
            try:
                return json.loads(l[1:])
            except Exception as ex:
                logging.warn('malformed line in %s: %s', self.fn, l)
                continue
        raise StopIteration
                             
    def next(self):
        return self.__next()
Example #24
# FOR EACH SAMPLE, READ IN MATCHES. CATALOG LOCUS IS [2], SAMPLE LOCUS IS [4], COV IS [6]
# ADD COVERAGE TO CORRECT INDEX FOR THE KEY == CATALOG LOCUS

for i in range(nsamples):
    sample  = samples[i]
    if gzipped == 1:
        matches = stacks+"/"+sample+".matches.tsv.gz"
        sys.stderr.write("Parsing "+sample+".matches.tsv.gz...\n")
        logfile.write("Parsing "+sample+".matches.tsv.gz...\n")
        matches = GzipFile(matches, 'r')
    else:
        matches = stacks+"/"+sample+".matches.tsv"
        sys.stderr.write("Parsing "+sample+".matches.tsv...\n")
        logfile.write("Parsing "+sample+".matches.tsv...\n")
        matches = open(matches, 'r')
    matches.readline()
    nmatches = 0
    for match in matches:
        match = match.strip('\n').split('\t')
        clocus = match[2]
        slocus = match[4]
        coverage = int(match[6])
        if clocus in locusCoverage:
            nmatches += 1
            locusCoverage[clocus][i] += coverage
            if nmatches % 1000 == 0:
                sys.stderr.write("Matches found (incl. alt. alleles):\t%s\r"%(nmatches))
    matches.close()
    sys.stderr.write("Matches found (incl. alt. alleles):\t%s\n"%(nmatches))
    logfile.write("Matches found (incl. alt. alleles):\t%s\n"%(nmatches))
Example #25
File: io.py Project: ahvigil/picklr
class Rdata(object):
    def __init__(self, name, mode='rb', buffering=1):
        self._file=None
        self._format=None
        self._compression=NO_COMPRESSION
        # file version info
        self._version = None
        self._rversion = None
        self._min_rversion = None

        self.open(name, mode, buffering)
        self.readHeader()

    def open(self, name, mode, buffering):
        # Open as ordinary binary file and check type
        # reopen as necessary using required class
        self._file = open(name, mode, buffering)
        magic = self._file.read(3)
        # gzip
        if magic[0] == '\x1f' and magic[1] == '\x8b' and magic[2] == '\x08':
            self._file.close()
            self._file = GzipFile(name, mode, buffering)
        # bz2
        elif magic[0]=='\x42' and magic[1]=='\x5a' and magic[2]=='\x68':
            self._compression=BZIP2_COMPRESSION
            self._file.close()
            self._file = BZ2File(name, mode, buffering)
        # xz
        elif magic[0]=='\xfd' and magic[1]=='\x37' and magic[2]=='\x7a':
            raise NotImplementedError("xz compression not supported")
        # no compression, read from file as-is starting from beginning
        else:
            self._file.seek(0)

    # read n bytes from file
    def read(self, n=1):
        return self._file.read(n)

    # Read header for format and file version
    def readHeader(self):
        magic = self._file.readline().strip()
        # check magic number
        # TODO: implement multiple file versions
        if magic in ["RDX2"]:
            self._format=XDR_FILE

        # version 1 ASCII, binary, xdr
        elif magic in ["RDA1", "RDB1", "RDX1"]:
            self._version = 1
            raise NotImplementedError("Version 1 saves not yet supported")

        # version 2 ASCII, binary
        elif magic in ["RDA2", "RDB2"]:
            self._version=2
            raise NotImplementedError("Version 2 non-XDR saves not yet supported")

        elif magic[:2] == "RD":
            # magic number looks legit but did not match any known types
            raise RuntimeError("Unknown save version found")

        else:
            # this doesn't even look like an R save file
            raise RuntimeError("Unknown file format- are you sure this is an R save file?")

        ftype = self._file.readline().strip()
        #assert ftype == 'X'

        self._version = self.getInteger()
        self._rversion = "%d.%d.%d" % self.decodeVersion(self.getInteger())
        self._min_rversion = "%d.%d.%d" % self.decodeVersion(self.getInteger())

        if self._version != 2:
            raise RuntimeError("Only version 2 saves supported")

    def close(self):
        self._file.close()

    def getLength(self):
        length = self.getInteger()

        # very long arrays might need 64 bits to store length
        if length == -1:
            len1 = long( self.getInteger() )
            len2 = long( self.getInteger() )
            length = ( len1 << 32 ) + len2

        return length

    def getInteger(self):
        """
        Parse the next 4 bytes in the stream as an integer
        """
        if self._format == XDR_FILE:
            return xdrlib.Unpacker(self._file.read(R_XDR_INTEGER_SIZE)).unpack_int()

    #  serialize.c contains UnpackFlags(...)
    def getFlags(self):
        """
        Parse and return SEXP flags as in Serialize.c:UnpackFlags
        Takes in an integer containing object flags
        Returns a list of 5 items
        """
        flags = self.getInteger()

        ptype = flags & 255
        plevs = flags >> 12
        pisobj = True if flags & (1 << 8) else False
        phasattr = True if flags & (1 << 9) else False
        phastag = True if flags & (1 << 10) else False

        log.info("%s %s %s %s", SEXP_TYPES[ptype], pisobj, phasattr, phastag)

        return (ptype, plevs, pisobj, phasattr, phastag)

    def getIntegerVec(self):
        """
        Read a vector of integers
        """
        length = self.getLength()
        unpacker = xdrlib.Unpacker(self.read(length*R_XDR_INTEGER_SIZE))
        return unpacker.unpack_farray(length, unpacker.unpack_int)

    def getRealVec(self):
        """
        Get a vector of real numbers from the input stream, assuming
        numbers are stored according to XDR double floating point standard
        """
        length = self.getLength()

        unpacker = xdrlib.Unpacker(self._file.read(length*SIZEOF_DOUBLE))
        return unpacker.unpack_farray(length, unpacker.unpack_double)

    def getComplex(stream):
        raise NotImplementedError("Complex values not yet implemented")

    def getComplexVec(stream, length):
        raise NotImplementedError("Complex values not yet implemented")

    def getChar(self, n = 1):
        """
        Get n characters from the stream and return them as a string
        """
        if n<0: return "NA"
        else: return self._file.read(n)

    def getString(self):
        length = self.getInteger() # null terminated strings
        log.info("length = %d", length)
        string = self.getChar(length)
        log.indent();log.debug("'%s'", string);log.dedent()

        return string

    def decodeVersion(self, packed):
        """
        Decode packed version number into a human readable format
        """
        v = packed / 65536
        packed %= 65536
        p = packed / 256
        packed %= 256
        s = packed
        return (v, p, s)
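
The XDR decoding used by getInteger() can be shown in isolation: XDR stores integers as four big-endian bytes, so the (since-deprecated) standard-library xdrlib module unpacks b'\x00\x00\x00\x2a' as 42:

import xdrlib

unpacker = xdrlib.Unpacker(b'\x00\x00\x00\x2a')
assert unpacker.unpack_int() == 42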
Example #26
      sys.exit(1)
      
  
  fd = GzipFile(args[0], 'r')
 
  #aux buffer for new modified file
  output=""   
  #aux dict with keys src_dst combination and its corresponding line of the file as value.
  src_dst_dic={}
  #aux boolean dict with keys src_dst combination. 0=no response from DST yet, 1=response from DST for the last src_dst message.
  response={}
  #adjustable step time for messages in the same session
  step = float(args[1])
  
  while 1:
      line = fd.readline()
      #if script gets to the end of file and line is empty, break the loop
      if line == "": break
      line = line.split(' ')
      src_dst = line[2]+"_"+line[3]
      dst_src = line[3]+"_"+line[2]
      line[4] = (' '.join(str(n) for n in line[4:])).replace("\n","")
      line = line[0:5]
      
      #check if there's an entry for this combination of SRC and DST
      if src_dst in src_dst_dic:
          
          if response[src_dst] == 0 and float(line[0]) - float(src_dst_dic[src_dst][0]) < step:
              src_dst_dic[src_dst][4] += line[4]
          else:
              output += ' '.join(src_dst_dic[src_dst])+"\n"
Example #27
 def readline_checkEnd(self, size=-1):
     line = GzipFile.readline(self, size)
     if self.stop is not None and line[:self.stoplen] == self.stop: return ''
     return line
Example #31
File: recovery.py Project: tussock/Vault
    def recover_one(self, folder, password, destination):
        log("recover_one", folder, password, destination)
        save_cwd = os.getcwd()
        os.chdir(folder)
        encrypted = len(password) > 0
        tmp_dir = tempfile.mkdtemp()
        try:
            ############
            #
            #    PROCESSING THE TAR FILE
            #
            ############
            if encrypted:
                log("Password required")
                pass_file = os.path.join(tmp_dir, "pwd")
                os.mkfifo(pass_file, 0600)

                cat = Popen('find data -type f -print | sort | xargs cat', shell=True, stdout=PIPE)
                openssl = Popen("/usr/bin/openssl enc -d -aes256 -md sha256 -pass 'file:%s'" % pass_file, shell=True,
                                stdin=cat.stdout, stdout=PIPE)
                tar = Popen("/bin/tar -xzf - --directory '%s'" % (destination,), shell=True,
                            stdin=openssl.stdout, stdout=PIPE, stderr=PIPE)

                #    Send the passphrase via a pipe file. 
                #    It won't appear in the process list (via cmd line arg).
                tmp_fd = open(pass_file, "w")
                tmp_fd.write(password)
                tmp_fd.close()
            else:
                log("Starting tar")
                cat = Popen('find data -type f -print | sort | xargs cat', shell=True, stdout=PIPE)
                tar = Popen("/bin/tar -xzf - --directory '%s'" % destination, shell=True,
                            stdin=cat.stdout, stdout=PIPE, stderr=PIPE)

            #    Wait for the TAR to finish
            log('Waiting for tar to finish')

            stdout, stderr = tar.communicate()
            log("stderr=", stderr)
            log("stdout=", stdout)
            tar.wait()
            log("Main extraction complete")
            print("Errors:", stderr)


            ############
            #
            #    PROCESSING THE LOF FILE
            #
            ############
            if encrypted:
                log("starting lof processing")
                pass_file = os.path.join(tmp_dir, "pwd2")
                os.mkfifo(pass_file, 0600)

                lof_file = os.path.join(tmp_dir, "lof")

                openssl = Popen("/usr/bin/openssl enc -d -aes256 -md sha256 -pass 'file:%s' -in lof.enc -out '%s'" % (pass_file, lof_file),
                                shell=True, stdout=PIPE)
                #    Send the passphrase via a pipe file. 
                #    It won't appear in the process list (via cmd line arg).
                tmp_fd = open(pass_file, "w")
                tmp_fd.write(password)
                tmp_fd.close()
                openssl.wait()
                lof = GzipFile(lof_file, mode="rb")
            else:
                lof = GzipFile("lof", "r")
            try:
                log("Start LOF processing for Deletes")
                while True:
                    line = lof.readline()
                    if not line:
                        log("Done!")
                        break
                    #    Remove the \n
                    line = line[:-1]
                    log("line", line)
                    if line == "":
                        #    New folder (remember to strip the \n
                        folder = lof.readline()[:-1].decode("quopri_codec")
                        log("New folder", folder)
                        continue

                    parts = line.split(",")
                    log("Line parts:", parts)
                    name = parts[0].decode("quopri_codec")
                    type = parts[1]
                    if type != "X":
                        log("Not a delete")
                        continue

                    if folder[0] == os.sep:
                        folder = folder[1:]
                    path = os.path.join(destination, folder, name)
                    log("DELETE ", path)
                    if os.path.isdir(path):
                        shutil.rmtree(path)
                    else:
                        os.remove(path)
            finally:
                lof.close()
        except Exception as e:
            print("Got exception in recover:", str(e))
        finally:
            #    Remove the temp dir
            shutil.rmtree(tmp_dir)
            os.chdir(save_cwd)
Example #32
class BigTxtFile:
    def __init__(self, fp, inibuff=100, header_preffix=None, split=None):
        '''
BigTxtFile(file_obj[, inibuff=100, header_preffix=None, split=None]) -> BigTxtFile

fp - must be a file object (seek+tell methods)

inibuff - Initial buffer in lines to be read and detect CRlen

header_preffix - If provided inibuff will be discarded and all the lines
starting with header_preffix will be treated as a different part of the file
(header)

split - If provided, every line after header will be split by the provided
delimiter'''

        assert type(fp) in [str, file]
        if type(fp) == str:
            try:
                self.fp = GzipFile(fp)
                self.fp.readline()
                self.fp.seek(0)
            except:
                self.fp = open(fp)
        else:
            self.fp = fp

        self.header = []
        self.inibuff = []
        self.lenCR = 1
        self.split = split

        self.bodypos = 0

        if header_preffix:
            for i in self.fp:
                self.bodypos += len(i)
                if len(i.strip()) != len(i):
                    self.lenCR = len(i) - len(i.strip())
                if i.startswith(header_preffix):
                    self.header.append(i.strip())
                else:
                    self.bodypos -= len(i)
                    break

        else:
            for i in self.fp:
                self.inibuff.append(i)
                inibuff -= 1
                if inibuff < 1:
                    break

        if self.inibuff:
            self.lenCR = len(self.inibuff[0]) - len(self.inibuff[0].strip())

        self._initialise()

    def _initialise(self):
        '''Abstract method in case you want postprocess headers or indexing'''
        pass

    def _body(self):
        '''Generator for each line of the body (whole file if header not provided)'''
        self.fp.seek(self.bodypos)
        if self.split:
            for i in self.fp:
                yield i[:-self.lenCR].split(self.split)
        else:
            for i in self.fp:
                yield i[:-self.lenCR]

    def _header(self):
        '''Generator for the header records'''
        for i in self.header:
            yield i

    def __iter__(self):
        '''Generator for the whole file'''
        for i in itertools.chain(self._header(), self._body()):
            yield i
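
A hypothetical use of BigTxtFile over a tab-separated file with '#'-prefixed header lines (the filename and prefix are assumptions):

big = BigTxtFile('records.tsv.gz', header_preffix='#', split='\t')
for header_line in big._header():
    print(header_line)
for fields in big._body():
    print(fields[0])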
Example #33
import sys
import os
import codecs
from datetime import datetime, timedelta
from gzip import GzipFile

output = codecs.open('kv7kalender.idx', 'w', 'UTF-8')

lastupdated_filenames = {}

kalender_filenames = [sys.argv[1] + '/' + x for x in os.listdir(sys.argv[1])]
kalender_threshold = (datetime.now() - timedelta(days=3)).isoformat()

for filename in sorted(kalender_filenames):
    f = GzipFile(filename, 'r')
    try:
        firstline = f.readline()[:-1]
        values = firstline.split('|')
        subscription = values[2]
        creationdate = values[7]
        if (creationdate < kalender_threshold):
            continue
        if subscription not in lastupdated_filenames:
            lastupdated_filenames[subscription] = {
                'filename': filename,
                'creationdate': creationdate
            }
        elif creationdate > lastupdated_filenames[subscription]['creationdate']:
            lastupdated_filenames[subscription] = {
                'filename': filename,
                'creationdate': creationdate
            }
Example #34
File: filequeue.py Project: travisfw/hq
class QueueFileReader(object):
    '''reads (dequeues) from single queue file'''
    def __init__(self, qfile, noupdate=False):
        self.fn = qfile
        self.noupdate = noupdate
        self.map = None
        self.open()
    def open(self):
        fd = os.open(self.fn, os.O_RDWR)
        self.map = mmap.mmap(fd, 0, access=mmap.ACCESS_WRITE)
        # mmap dups fd. fd need not be kept open.
        os.close(fd)
        self.pos = 0
        if self.map[0:2] == '\x1f\x8b':
            self.z = GzipFile(fileobj=self.map, mode='rb')
            self.__next = self.__next_gzip
        else:
            self.z = None
            self.__next = self.__next_mmap
    def close(self):
        if self.z:
            self.z.close()
            self.z = None
        if self.map:
            self.map.close()
            self.map = None
    def __next_mmap(self):
        while self.pos < self.map.size():
            el = self.map.find('\n', self.pos + 1)
            if el < 0: el = self.map.size()
            s = self.pos
            self.pos = el + 1
            if self.map[s] == ' ':
                l = self.map[s + 1:el]
                if not self.noupdate:
                    self.map[s] = '#'
                try:
                    return cjson.decode(l)
                except Exception as ex:
                    logging.warn('malformed line in %s at %d: %s', self.fn,
                                 s, l)
                    continue
        raise StopIteration

    def __next_gzip(self):
        while 1:
            try:
                l = self.z.readline()
            except IOError as ex:
                # probably CRC error due to truncated file. discard the rest.
                # should we keep the file for later diagnosis?
                logging.error('error in %s: %s', self.fn, str(ex))
                raise StopIteration
            if l == '': break
            if l[0] != ' ': continue
            try:
                return cjson.decode(l[1:])
            except Exception as ex:
                logging.warn('malformed line in %s: %s', self.fn, l)
                continue
        raise StopIteration

    def next(self):
        if self.map is None:
            logging.warn("QueueFileReader:next called on closed file:%s",
                         self.fn)
            raise StopIteration
        return self.__next()
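
A driving sketch for QueueFileReader, assuming the queue-file format implied by the reader: each record line starts with a one-byte status flag, ' ' while pending, rewritten to '#' once dequeued (Python 2, since the class uses cjson and str-indexed mmap).

import cjson

# Build a tiny uncompressed queue file (assumed format: flag + JSON + '\n').
with open('q000.txt', 'wb') as w:
    w.write(' ' + cjson.encode({'u': 'http://example.com/'}) + '\n')
    w.write(' ' + cjson.encode({'u': 'http://example.org/'}) + '\n')

r = QueueFileReader('q000.txt')
try:
    while True:
        print r.next()          # {'u': 'http://example.com/'} ...
except StopIteration:
    pass
finally:
    r.close()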
Example #35
if __name__ == '__main__':
    from sys import argv, stderr
    from gzip import GzipFile
    raw_vocab = argv[1]
    out_trans = argv[2]
    out_vocab = argv[3]
    fi = GzipFile(raw_vocab, "r")
    f = GzipFile(out_trans, "w")
    g = GzipFile(out_vocab, "w")
    addnewmapping("<*>", f, g)
    addnewmapping("<PP_UNK>", f, g)
    next_to_write = next_seen
    p = 0
    while True:
        v = fi.readline()
        if not v:
            break
        key, value = v.strip().split("\t")
        nkey = transformation(key)
        nkeyid = wordno(nkey)
        addmapping(key, nkeyid, f)
        if nkeyid >= next_to_write:
            print >> g, "%s\t%d" % (nkey, nkeyid)
            next_to_write = nkeyid + 1
        p += 1
        if (p % 100000) == 0:
            print >> stderr, "added", p, key, nkey, nkeyid
    fi.close()
    f.close()
    g.close()
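
The script above leans on helpers defined elsewhere in its project; `addnewmapping`, `addmapping`, `transformation`, `wordno` and `next_seen` are not shown. The stand-ins below are only a guess at their contracts, given so the control flow is readable:

# Hypothetical stand-ins; the real implementations live elsewhere (Python 2).
wordids = {}
next_seen = 0

def wordno(w):
    # assign word ids in order of first appearance
    global next_seen
    if w not in wordids:
        wordids[w] = next_seen
        next_seen += 1
    return wordids[w]

def addmapping(key, keyid, f):
    print >> f, "%s\t%d" % (key, keyid)

def addnewmapping(w, f, g):
    i = wordno(w)
    addmapping(w, i, f)
    print >> g, "%s\t%d" % (w, i)

def transformation(key):
    # project-specific normalisation; lower-casing is just an example
    return key.lower()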
Example #36
import json
from gzip import GzipFile


class LazyJsonReader(object):
    """Newline-separated JSON log reader tolerating massive log files.

    As bytes are read from the file a line count is kept, so at any time we
    know both the position of the file pointer and the line number we are
    on; that makes it possible to fetch the previous/next line with some
    crafty seeking.
    """

    chunk_size = 2048

    def __init__(self, file_path, file_gzipped=False):
        self.gz = file_gzipped
        self.file = GzipFile(file_path, 'rb') if file_gzipped else open(
            file_path, 'rb')
        self.line = 0

    def _get_position(self):
        """
        Return a tuple of (current_line, current_file_position).
        """
        return (self.line, self.file.tell())

    def _seek_to(self, line, pos):
        """
        Seek to arbitrary locations. There's no logic here, this method assumes the line number and position specified
        are correct.
        """
        self.line = line
        self.file.seek(pos)

    def decode(self, s):
        return s.decode('UTF-8')

    def read_next(self):
        """
        Read the next line from the file, parse and return. Returns None if out of lines.
        """
        data = self.file.readline().strip()
        if data:
            self.line += 1
        return json.loads(self.decode(data)) if data else None

    def read_prev(self):
        """
        Read the previous line from the file, parse and return. Returns None if out of lines.
        """
        original_pos = current_pos = self.file.tell()

        # can't fall off the beginning
        if current_pos == 0:
            return None

        # Rewind by chunk_size and read that many bytes, repeating until the
        # buffer holds at least three '\n' characters: enough to guarantee
        # that the second-to-last split element below is a complete line
        # even when the first rewind lands mid-line.
        rewound_chunk = b""
        while rewound_chunk.count(b"\n") < 3:
            before_jump = current_pos

            # Jump backwards chunk_size bytes, clamping at the start of file
            current_pos = max(0, current_pos - self.chunk_size)
            self.file.seek(current_pos)
            jumped_by = before_jump - current_pos

            # prepend the freshly read chunk to our buffer
            rewound_chunk = b''.join(
                [self.file.read(jumped_by), rewound_chunk])

            # If we just read from the beginning of the file, stop regardless
            if current_pos == 0:
                break

        # we have a chunk containing at least one full line
        # find the last line in the chunk
        lines_split = rewound_chunk.split(b"\n")

        # -1 => blank
        # -2 => the line just before the cursor (what we return)
        # -3 => the line before that; won't exist if we hit BOF
        # -4+ => earlier lines and/or partial line garbage
        if len(lines_split) < 3:
            self.line = 0
            self.file.seek(0)
            return json.loads(self.decode(lines_split[0]))
        prev_line = lines_split[-2]

        # Calculate how far backwards we jumped, and seek to the beginning
        # of the line we're returning.
        # TODO: should this live elsewhere so a following read_next doesn't
        # return this line again?
        after_prev_line = lines_split[-1:]
        rewound_len = len(b"\n".join([prev_line] + after_prev_line))
        self.file.seek(original_pos - rewound_len)
        self.line -= 1

        return json.loads(self.decode(prev_line))
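
A short sketch driving LazyJsonReader in both directions (the file name and contents are illustrative): read_next advances one line; read_prev returns the line just before the cursor and seeks to its start, using the chunked rewind above.

import json

with open('events.log', 'wb') as w:
    for n in range(3):
        w.write(json.dumps({'n': n}).encode('UTF-8') + b'\n')

r = LazyJsonReader('events.log')
print(r.read_next())   # -> {'n': 0}
print(r.read_next())   # -> {'n': 1}
print(r.read_prev())   # -> {'n': 1} again; the cursor now sits at its start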
Example #37
    print 'You have PhosphoSitePlus datasets; they will be added to the database.'

print 'Finished fetching public data.'

####  UNIPROT

if 0:
    cur.execute("drop table if exists uniprot")
    cur.execute(
        "create table uniprot (u1 varchar(32) primary key, entry varchar(16), longname varchar(128), aaseq text, aalen int)"
    )
    fi = GzipFile('data/uniprot/uniprot_sprot.fasta.gz')
    idline = None
    seq = ''
    while 1:
        l = fi.readline()
        if not l:
            i = idline.strip().split('|')
            u1 = i[1]
            entry, longname = i[2].split(' ', 1)
            if entry.endswith('_HUMAN'):
                longname = longname.split(' OS=')[0].replace("'", '')
                cur.execute(
                    "insert into uniprot (u1, entry, longname, aaseq, aalen) values ('%s', '%s', '%s', '%s', %d)"
                    % (u1, entry, longname, seq, len(seq)))
            break
        if l.startswith('>'):
            if not idline:
                # first record: just start accumulating
                idline = l
                seq = ''
                continue
            # flush the previous record (reconstructed here to mirror the
            # end-of-file branch above; the loop tail was truncated in the
            # original snippet)
            i = idline.strip().split('|')
            u1 = i[1]
            entry, longname = i[2].split(' ', 1)
            if entry.endswith('_HUMAN'):
                longname = longname.split(' OS=')[0].replace("'", '')
                cur.execute(
                    "insert into uniprot (u1, entry, longname, aaseq, aalen) values ('%s', '%s', '%s', '%s', %d)"
                    % (u1, entry, longname, seq, len(seq)))
            idline = l
            seq = ''
        else:
            seq += l.strip()
    fi.close()
Example #38
    def readline_NOcheckEnd(self, size=-1):
        line = GzipFile.readline(self, size)
        return unicode(line, 'latin_1').encode('utf_8')
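
For context, a sketch of the subclass this method would sit in; the class name is an assumption. It transcodes each line of a gzip-compressed Latin-1 file to UTF-8 as it is read (Python 2, given the unicode builtin):

from gzip import GzipFile

# Hypothetical enclosing class for the method above (Python 2).
class Latin1GzipFile(GzipFile):
    def readline_NOcheckEnd(self, size=-1):
        line = GzipFile.readline(self, size)
        return unicode(line, 'latin_1').encode('utf_8')

# f = Latin1GzipFile('legacy.txt.gz', 'r')
# print f.readline_NOcheckEnd()   # a UTF-8 encoded str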
Example #41
import sys
import os
import codecs
from gzip import GzipFile
from datetime import datetime, timedelta

output = codecs.open('kv7kalender.idx', 'w', 'UTF-8')

lastupdated_filenames = {}

kalender_filenames = [sys.argv[1] + '/' + x for x in os.listdir(sys.argv[1])]
kalender_threshold = (datetime.now() - timedelta(days=3)).isoformat()

for filename in sorted(kalender_filenames):
    f = GzipFile(filename, 'r')
    try:
        firstline = f.readline()[:-1]
        values = firstline.split('|')
        subscription = values[2]
        creationdate = values[7]
        if (creationdate < kalender_threshold):
            continue
        if subscription not in lastupdated_filenames:
            lastupdated_filenames[subscription] = {
                'filename': filename,
                'creationdate': creationdate
            }
        elif creationdate > lastupdated_filenames[subscription]['creationdate']:
            lastupdated_filenames[subscription] = {
                'filename': filename,
                'creationdate': creationdate
            }
    finally:
        f.close()

for key, values in lastupdated_filenames.items():
    print key + ' - ' + values['filename']
    output.write(values['filename'] + '\n')

output.close()
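
The per-subscription bookkeeping above is a running maximum over creationdate; an equivalent update step (same variables assumed) collapses the two branches into one comparison:

# Equivalent update: keep the entry with the greatest creationdate.
entry = lastupdated_filenames.get(subscription)
if entry is None or creationdate > entry['creationdate']:
    lastupdated_filenames[subscription] = {
        'filename': filename,
        'creationdate': creationdate
    }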