def __enter__(self):
    """Context-manager entry: connect and return a SearchService client.

    Stores the open transport on ``self.transport`` so ``__exit__`` can
    close it later.
    """
    sock = factory.createSocket(self.host, self.port)
    self.transport = factory.createTransport(sock)
    proto = factory.createProtocol(self.transport)
    client = SearchService.Client(proto)
    self.transport.open()
    return client
def __enter__(self):
    """Context-manager entry: connect and return a Search client.

    The transport is kept on ``self.transport`` for ``__exit__`` to close.
    """
    sock = factory.createSocket(self.host, self.port)
    self.transport = factory.createTransport(sock)
    proto = factory.createProtocol(self.transport)
    client = Search.Client(proto)
    self.transport.open()
    return client
def write_communication_to_buffer(comm):
    '''
    Serialize communication to buffer (binary string) and return buffer.
    '''
    membuf = TMemoryBuffer()
    comm.write(factory.createProtocol(membuf))
    return membuf.getvalue()
def read_from_buffer(buf, name=None):
    '''
    Deserialize a minsky.EDoc from buf (a binary string) and return it.

    Args:
        buf: binary string holding a serialized EDoc.
        name: optional key name, used only to label the error message.

    Raises:
        Re-raises whatever the thrift deserializer raises, after writing
        a diagnostic line to stderr.
    '''
    transport_in = TMemoryBuffer(buf)
    protocol_in = factory.createProtocol(transport_in)
    obj = minsky.EDoc()
    try:
        obj.read(protocol_in)
    except Exception:
        import sys
        sys.stderr.write("%s Error reading buffer %s\n" %
                         (("With key " + name) if name is not None else "",
                          str(buf)))
        # Bare raise preserves the original traceback; the previous
        # ``raise e`` reset it under Python 2.
        raise
    return obj
def read_communication_from_buffer(buf, add_references=True):
    '''
    Deserialize buf (a binary string) and return resulting
    communication.  Add references if requested.
    '''
    comm = Communication()
    comm.read(factory.createProtocol(TMemoryBuffer(buf)))
    if add_references:
        add_references_to_communication(comm)
    return comm
def get(self, communication_ids):
    """Fetch Communications from the remote FetchCommunicationService.

    Args:
    - `communication_ids`: List of CommunicationID strings

    Returns:
    - List of Communications
    """
    socket = thrift_factory.createSocket(self.host, int(self.port))
    transport = thrift_factory.createTransport(socket)
    protocol = thrift_factory.createProtocol(transport)
    client = FetchCommunicationService.Client(protocol)
    transport.open()
    try:
        fetch_request = FetchRequest()
        fetch_request.communicationIds = communication_ids
        fetch_result = client.fetch(fetch_request)
    finally:
        # The original never closed the transport, leaking the socket on
        # every call (and on any fetch error).
        transport.close()
    return fetch_result.communications
def __init__(self, filename, add_references=True, filetype=FileType.AUTO):
    """Open *filename* for reading serialized communications.

    Args:
        filename: path to a tar/zip archive or a (possibly compressed)
            concatenated stream of serialized objects.
        add_references: if True, add references on deserialization.
        filetype: a FileType value; AUTO sniffs the container format.

    Raises:
        ValueError: if filetype is not a recognized FileType.
    """
    filetype = FileType.lookup(filetype)
    self._add_references = add_references
    self._source_filename = filename
    if filetype == FileType.TAR:
        self.filetype = 'tar'
        self.tar = tarfile.open(filename, 'r|')
    elif filetype == FileType.TAR_GZ:
        self.filetype = 'tar'
        self.tar = tarfile.open(filename, 'r|gz')
    elif filetype == FileType.TAR_BZ2:
        self.filetype = 'tar'
        self.tar = tarfile.open(filename, 'r|bz2')
    elif filetype == FileType.ZIP:
        self.filetype = 'zip'
        self.zip = zipfile.ZipFile(filename, 'r')
        self.zip_infolist = self.zip.infolist()
        self.zip_infolist_index = 0
    elif filetype == FileType.STREAM:
        self.filetype = 'stream'
        f = open(filename, 'rb')
    elif filetype == FileType.STREAM_GZ:
        self.filetype = 'stream'
        f = gzip.open(filename, 'rb')
    elif filetype == FileType.STREAM_BZ2:
        self.filetype = 'stream'
        f = bz2.BZ2File(filename, 'r')
    elif filetype == FileType.AUTO:
        if tarfile.is_tarfile(filename):
            self.filetype = 'tar'
            self.tar = tarfile.open(filename, 'r|*')
        elif zipfile.is_zipfile(filename):
            self.filetype = 'zip'
            self.zip = zipfile.ZipFile(filename, 'r')
            self.zip_infolist = self.zip.infolist()
            self.zip_infolist_index = 0
        elif mimetypes.guess_type(filename)[1] == 'gzip':
            # this is not a true stream---is_tarfile will have
            # successfully seeked backwards on the file if we have
            # reached this point
            self.filetype = 'stream'
            f = gzip.open(filename, 'rb')
        elif mimetypes.guess_type(filename)[1] == 'bzip2':
            # this is not a true stream
            self.filetype = 'stream'
            f = bz2.BZ2File(filename, 'r')
        else:
            # this is not a true stream
            self.filetype = 'stream'
            f = open(filename, 'rb')
    else:
        raise ValueError('unknown filetype %d' % filetype)
    # BUG FIX: the original compared with ``is 'stream'``, an identity
    # check that only works because CPython interns short literals.
    if self.filetype == 'stream':
        self.transport = TTransport.TFileObjectTransport(f)
        self.protocol = factory.createProtocol(self.transport)
        self.transport.open()
def __init__(self, thrift_type, filename, postprocess=None,
             filetype=FileType.AUTO):
    """Open *filename* for reading serialized thrift objects.

    Args:
        thrift_type: class of the objects to deserialize.
        filename: path to a tar/zip archive or a (possibly compressed)
            concatenated stream of serialized objects.
        postprocess: optional one-argument callable applied to each
            deserialized object; defaults to a no-op.
        filetype: a FileType value; AUTO sniffs the container format.

    Raises:
        ValueError: if filetype is not a recognized FileType.
    """
    filetype = FileType.lookup(filetype)
    self._thrift_type = thrift_type
    if postprocess is None:
        def _noop(obj):
            return
        self._postprocess = _noop
    else:
        self._postprocess = postprocess
    self._source_filename = filename
    if filetype == FileType.TAR:
        self.filetype = 'tar'
        self.tar = tarfile.open(filename, 'r|')
    elif filetype == FileType.TAR_GZ:
        self.filetype = 'tar'
        self.tar = tarfile.open(filename, 'r|gz')
    elif filetype == FileType.TAR_BZ2:
        self.filetype = 'tar'
        self.tar = tarfile.open(filename, 'r|bz2')
    elif filetype == FileType.ZIP:
        self.filetype = 'zip'
        self.zip = zipfile.ZipFile(filename, 'r')
        self.zip_infolist = self.zip.infolist()
        self.zip_infolist_index = 0
    elif filetype == FileType.STREAM:
        self.filetype = 'stream'
        f = open(filename, 'rb')
    elif filetype == FileType.STREAM_GZ:
        self.filetype = 'stream'
        f = gzip.open(filename, 'rb')
    elif filetype == FileType.STREAM_BZ2:
        self.filetype = 'stream'
        f = bz2.BZ2File(filename, 'r')
    elif filetype == FileType.AUTO:
        if tarfile.is_tarfile(filename):
            self.filetype = 'tar'
            self.tar = tarfile.open(filename, 'r|*')
        elif zipfile.is_zipfile(filename):
            self.filetype = 'zip'
            self.zip = zipfile.ZipFile(filename, 'r')
            self.zip_infolist = self.zip.infolist()
            self.zip_infolist_index = 0
        elif mimetypes.guess_type(filename)[1] == 'gzip':
            # this is not a true stream---is_tarfile will have
            # successfully seeked backwards on the file if we have
            # reached this point
            self.filetype = 'stream'
            f = gzip.open(filename, 'rb')
        elif mimetypes.guess_type(filename)[1] == 'bzip2':
            # this is not a true stream
            self.filetype = 'stream'
            f = bz2.BZ2File(filename, 'r')
        else:
            # this is not a true stream
            self.filetype = 'stream'
            f = open(filename, 'rb')
    else:
        raise ValueError('unknown filetype %d' % filetype)
    # BUG FIX: the original compared with ``is 'stream'``, an identity
    # check that only works because CPython interns short literals.
    if self.filetype == 'stream':
        self.transport = TTransport.TFileObjectTransport(f)
        self.protocol = factory.createProtocol(self.transport)
        self.transport.open()
def process_input(func):
    # Apply ``func`` to EDocs pulled from redis, either joining from an
    # explicit list of names (--from-list, with IDF-based reweighting and
    # re-serialization) or by scanning key patterns.
    # NOTE(review): relies on module-level ``args``, ``rdb``, ``out_writer``,
    # ``vocabs`` and ``annotation_level`` -- confirm against the enclosing
    # script; structure reconstructed from a whitespace-collapsed source.
    num_seen = 0
    if args.from_list:
        lst = []
        if args.field is None:
            print "Please provide a --field argument when joining from a list"
        else:
            # pair each redis list name with (up to args.N of) its members
            lst = [(name, x)
                   for name in args.name
                   for x in rdb.lrange(name, 0, min(rdb.llen(name), args.N))
                   ]

        def computed_idf():
            # One pass over the corpus counting document frequencies of
            # predicate words; returns a closure word -> smoothed IDF.
            N = 0
            idf = collections.defaultdict(float)
            print "computing IDF weights...."
            for (name, d) in lst:
                dd = args.name_prefix + d
                obuff = rdb.get(dd) if args.field is None else rdb.hget(dd, args.field)
                if obuff is None:
                    continue
                obj = read_from_buffer(obuff, args.name_prefix + d)
                N += 1
                if obj.entities is None:
                    continue
                # distinct predicate words at the requested annotation level
                words = set([s.predicate.word
                             for entity in obj.entities
                             for mention in entity.mentions
                             for s in mention.structures
                             if s.annot_level == annotation_level])
                for w in words:
                    idf[w] += 1
            print "done computing idf weights"

            def curried(w):
                # unseen words get a tiny df (1E-8), i.e. a very large IDF
                return math.log(N / (idf[w] if w in idf else 1E-8))
            #print [(i, curried(i)) for i in idf.keys()]
            return curried
        idf_func = computed_idf()
        for (di, (name, d)) in enumerate(lst):
            dd = args.name_prefix + d
            obuff = rdb.get(dd) if args.field is None else rdb.hget(dd, args.field)
            if obuff is None:
                continue
            obj = read_from_buffer(obuff, args.name_prefix + d)
            #print "Processing %s" % obj.id
            # entity count before func runs, to report how many were removed
            pnuments = 0 if obj.entities is None else len(obj.entities)
            # presumably func prunes obj.entities in place using the IDF
            # threshold -- TODO confirm against the callers
            func(obj, out_writer, annotation_level, \
                 idf_func, args.threshold, \
                 vocabs)
            if obj.entities is None or len(obj.entities) == 0:
                print "EDoc %s no longer has any entities; removing from corpus" % (obj.id)
                continue
            if (di + 1) % 100 == 0:
                print "Saving the %dth document" % (di + 1)
            print "Saving EDoc %s with %d entities (%d removed)" % (obj.id, len(obj.entities), pnuments - len(obj.entities))
            # re-serialize the (possibly pruned) EDoc back into redis under
            # a ":idf<threshold>"-suffixed key/field, and record its name
            transport_out = TMemoryBuffer()
            protocol_out = factory.createProtocol(transport_out)
            obj.write(protocol_out)
            suffix = ":idf" + str(args.threshold)
            if args.field is None:
                rdb.set(args.name_prefix + d + suffix, transport_out.getvalue())
            else:
                rdb.hset(dd, args.field + suffix, transport_out.getvalue())
            rdb.rpush(name + suffix, d)
            num_seen += 1
    else:
        print "ERROR: Not available"
        exit(1)
    # Pattern-scan mode: iterate redis keys matching each name pattern.
    # NOTE(review): after the from-list branch num_seen is typically
    # >= args.N, so this loop exits immediately -- confirm intended flow.
    for x in args.name:
        if args.N >= 0 and num_seen >= args.N:
            break
        key_list = rdb.keys(x)
        if args.random:
            random.shuffle(key_list)
        for y in key_list:
            if num_seen >= args.N:
                break
            ty = rdb.type(y)
            #print y, ty
            if ty == 'string' or \
               (ty == 'hash' and not args.field is None):
                if num_seen >= args.N:
                    break
                obuff = rdb.get(y) if ty == 'string' else rdb.hget(y, args.field)
                if obuff is None:
                    continue
                obj = read_from_buffer( obuff, args.which, y )
                #print "Processing %s" % obj.id
                func(obj, args, out_writer, vocabs)
                num_seen += 1
            else:
                # other key types (e.g. lists) are drained via RedisReader
                rr = concrete.util.RedisReader(rdb, y, key_type = ty)
                for obj_buf in rr:
                    if num_seen >= args.N:
                        break
                    obj = read_from_buffer(obj_buf, args.which, y)
                    func(obj, args, out_writer, vocabs)
                    num_seen += 1