示例#1
0
    def __enter__(self):
        """
        Connect to the search service and hand back a client for it.

        The socket/transport/protocol stack is built through `factory`
        for `self.host` and `self.port`.  The transport is kept on
        `self.transport` and is opened before the client is returned.

        Returns:
        - A `SearchService.Client` bound to the newly created protocol
        """
        sock = factory.createSocket(self.host, self.port)
        self.transport = factory.createTransport(sock)
        client = SearchService.Client(factory.createProtocol(self.transport))
        self.transport.open()
        return client
    def __enter__(self):
        """
        Connect to the search service and hand back a client for it.

        The socket/transport/protocol stack is built through `factory`
        for `self.host` and `self.port`.  The transport is kept on
        `self.transport` and is opened before the client is returned.

        Returns:
        - A `Search.Client` bound to the newly created protocol
        """
        sock = factory.createSocket(self.host, self.port)
        self.transport = factory.createTransport(sock)
        client = Search.Client(factory.createProtocol(self.transport))
        self.transport.open()
        return client
示例#3
0
def write_communication_to_buffer(comm):
    '''
    Write `comm` through a protocol backed by an in-memory transport
    and return everything that was written to that transport.
    '''
    memory_transport = TMemoryBuffer()
    comm.write(factory.createProtocol(memory_transport))
    return memory_transport.getvalue()
示例#4
0
def write_communication_to_buffer(comm):
    '''
    Write `comm` through a protocol backed by an in-memory transport
    and return everything that was written to that transport.
    '''
    memory_transport = TMemoryBuffer()
    comm.write(factory.createProtocol(memory_transport))
    return memory_transport.getvalue()
def read_from_buffer(buf, name = None):
    '''
    Deserialize `buf` into a new `minsky.EDoc` and return it.

    `buf` is wrapped in an in-memory transport and read through a
    protocol created by `factory`.  `name` is optional and is only used
    to label the diagnostic written to stderr when reading fails.

    Any exception raised while reading is reported on stderr and then
    re-raised unchanged.
    '''
    transport_in = TMemoryBuffer(buf)
    protocol_in = factory.createProtocol(transport_in)
    obj = minsky.EDoc()
    try:
        obj.read(protocol_in)
    except Exception:
        import sys
        # %-format the key instead of concatenating so that a non-string
        # name cannot raise a TypeError that hides the real read error.
        label = ("With key %s" % (name,)) if name is not None else ""
        sys.stderr.write("%s Error reading buffer %s\n" % (label, str(buf)))
        # Bare raise keeps the original traceback (``raise e`` would
        # restart it at this line on Python 2).
        raise
    return obj
示例#6
0
def read_communication_from_buffer(buf, add_references=True):
    '''
    Build a Communication by reading `buf` through a protocol backed
    by an in-memory transport.  When `add_references` is true, the
    result is passed to add_references_to_communication before it is
    returned.
    '''
    communication = Communication()
    communication.read(factory.createProtocol(TMemoryBuffer(buf)))
    if not add_references:
        return communication
    add_references_to_communication(communication)
    return communication
示例#7
0
def read_communication_from_buffer(buf, add_references=True):
    '''
    Build a Communication by reading `buf` through a protocol backed
    by an in-memory transport.  When `add_references` is true, the
    result is passed to add_references_to_communication before it is
    returned.
    '''
    communication = Communication()
    communication.read(factory.createProtocol(TMemoryBuffer(buf)))
    if not add_references:
        return communication
    add_references_to_communication(communication)
    return communication
示例#8
0
    def get(self, communication_ids):
        """
        Fetch Communications by ID from the service at `self.host` /
        `self.port`.

        A new connection is opened for each call and is closed again
        before returning, whether or not the fetch succeeds.

        Args:
        - `communication_ids`: List of CommunicationID strings

        Returns:
        - List of Communications
        """
        socket = thrift_factory.createSocket(self.host, int(self.port))
        transport = thrift_factory.createTransport(socket)
        protocol = thrift_factory.createProtocol(transport)
        client = FetchCommunicationService.Client(protocol)
        transport.open()
        try:
            fetch_request = FetchRequest()
            fetch_request.communicationIds = communication_ids
            fetch_result = client.fetch(fetch_request)
        finally:
            # Release the connection even if the request fails; without
            # this every call would leave an open socket behind.
            transport.close()
        return fetch_result.communications
示例#9
0
    def __init__(self, filename, add_references=True, filetype=FileType.AUTO):
        """
        Open `filename` for reading as a tar archive, a zip archive or a
        (possibly compressed) stream.

        Args:
        - `filename`: Path of the file to open
        - `add_references`: Stored on `self._add_references`
        - `filetype`: Anything accepted by `FileType.lookup`.  With
          `FileType.AUTO` the kind is detected: tar first, then zip,
          then gzip/bzip2 according to `mimetypes.guess_type`, and
          finally a plain binary stream.

        Raises:
        - ValueError: if the looked-up filetype is none of the handled
          `FileType` values

        After construction `self.filetype` is one of 'tar', 'zip' or
        'stream'.  For 'tar', `self.tar` is set; for 'zip', `self.zip`,
        `self.zip_infolist` and `self.zip_infolist_index` are set; for
        'stream', `self.transport` and `self.protocol` are set and the
        transport is opened.
        """
        filetype = FileType.lookup(filetype)

        self._add_references = add_references
        self._source_filename = filename

        if filetype == FileType.TAR:
            self.filetype = 'tar'
            self.tar = tarfile.open(filename, 'r|')

        elif filetype == FileType.TAR_GZ:
            self.filetype = 'tar'
            self.tar = tarfile.open(filename, 'r|gz')

        elif filetype == FileType.TAR_BZ2:
            self.filetype = 'tar'
            self.tar = tarfile.open(filename, 'r|bz2')

        elif filetype == FileType.ZIP:
            self.filetype = 'zip'
            self.zip = zipfile.ZipFile(filename, 'r')
            self.zip_infolist = self.zip.infolist()
            self.zip_infolist_index = 0

        elif filetype == FileType.STREAM:
            self.filetype = 'stream'
            f = open(filename, 'rb')

        elif filetype == FileType.STREAM_GZ:
            self.filetype = 'stream'
            f = gzip.open(filename, 'rb')

        elif filetype == FileType.STREAM_BZ2:
            self.filetype = 'stream'
            f = bz2.BZ2File(filename, 'r')

        elif filetype == FileType.AUTO:
            if tarfile.is_tarfile(filename):
                self.filetype = 'tar'
                self.tar = tarfile.open(filename, 'r|*')

            elif zipfile.is_zipfile(filename):
                self.filetype = 'zip'
                self.zip = zipfile.ZipFile(filename, 'r')
                self.zip_infolist = self.zip.infolist()
                self.zip_infolist_index = 0

            else:
                # None of the files opened below is a true stream: the
                # tar/zip probes above worked on the named file, so it
                # is an ordinary file on disk.
                self.filetype = 'stream'
                encoding = mimetypes.guess_type(filename)[1]
                if encoding == 'gzip':
                    f = gzip.open(filename, 'rb')
                elif encoding == 'bzip2':
                    f = bz2.BZ2File(filename, 'r')
                else:
                    f = open(filename, 'rb')

        else:
            raise ValueError('unknown filetype %d' % filetype)

        # Compare by value: identity comparison against a string literal
        # only works by accident of interning.
        if self.filetype == 'stream':
            self.transport = TTransport.TFileObjectTransport(f)
            self.protocol = factory.createProtocol(self.transport)
            self.transport.open()
示例#10
0
    def __init__(self, thrift_type, filename,
                 postprocess=None, filetype=FileType.AUTO):
        """
        Open `filename` for reading as a tar archive, a zip archive or a
        (possibly compressed) stream.

        Args:
        - `thrift_type`: Stored on `self._thrift_type`
        - `filename`: Path of the file to open
        - `postprocess`: Callable stored on `self._postprocess`; when
          None, a function that takes one argument and does nothing is
          stored instead
        - `filetype`: Anything accepted by `FileType.lookup`.  With
          `FileType.AUTO` the kind is detected: tar first, then zip,
          then gzip/bzip2 according to `mimetypes.guess_type`, and
          finally a plain binary stream.

        Raises:
        - ValueError: if the looked-up filetype is none of the handled
          `FileType` values

        After construction `self.filetype` is one of 'tar', 'zip' or
        'stream'.  For 'tar', `self.tar` is set; for 'zip', `self.zip`,
        `self.zip_infolist` and `self.zip_infolist_index` are set; for
        'stream', `self.transport` and `self.protocol` are set and the
        transport is opened.
        """
        filetype = FileType.lookup(filetype)

        self._thrift_type = thrift_type
        if postprocess is None:
            def _noop(obj):
                return
            self._postprocess = _noop
        else:
            self._postprocess = postprocess
        self._source_filename = filename

        if filetype == FileType.TAR:
            self.filetype = 'tar'
            self.tar = tarfile.open(filename, 'r|')

        elif filetype == FileType.TAR_GZ:
            self.filetype = 'tar'
            self.tar = tarfile.open(filename, 'r|gz')

        elif filetype == FileType.TAR_BZ2:
            self.filetype = 'tar'
            self.tar = tarfile.open(filename, 'r|bz2')

        elif filetype == FileType.ZIP:
            self.filetype = 'zip'
            self.zip = zipfile.ZipFile(filename, 'r')
            self.zip_infolist = self.zip.infolist()
            self.zip_infolist_index = 0

        elif filetype == FileType.STREAM:
            self.filetype = 'stream'
            f = open(filename, 'rb')

        elif filetype == FileType.STREAM_GZ:
            self.filetype = 'stream'
            f = gzip.open(filename, 'rb')

        elif filetype == FileType.STREAM_BZ2:
            self.filetype = 'stream'
            f = bz2.BZ2File(filename, 'r')

        elif filetype == FileType.AUTO:
            if tarfile.is_tarfile(filename):
                self.filetype = 'tar'
                self.tar = tarfile.open(filename, 'r|*')

            elif zipfile.is_zipfile(filename):
                self.filetype = 'zip'
                self.zip = zipfile.ZipFile(filename, 'r')
                self.zip_infolist = self.zip.infolist()
                self.zip_infolist_index = 0

            else:
                # None of the files opened below is a true stream: the
                # tar/zip probes above worked on the named file, so it
                # is an ordinary file on disk.
                self.filetype = 'stream'
                encoding = mimetypes.guess_type(filename)[1]
                if encoding == 'gzip':
                    f = gzip.open(filename, 'rb')
                elif encoding == 'bzip2':
                    f = bz2.BZ2File(filename, 'r')
                else:
                    f = open(filename, 'rb')

        else:
            raise ValueError('unknown filetype %d' % filetype)

        # Compare by value: identity comparison against a string literal
        # only works by accident of interning.
        if self.filetype == 'stream':
            self.transport = TTransport.TFileObjectTransport(f)
            self.protocol = factory.createProtocol(self.transport)
            self.transport.open()
# Run `func` over every document named in the lists given by args.name and
# store the documents that still have entities afterwards.
#
# This function reads several names it does not define itself (args, rdb,
# out_writer, annotation_level, vocabs, factory, TMemoryBuffer); they must be
# provided by the enclosing scope.  `rdb` is presumably a Redis client --
# TODO confirm.
#
# Only the args.from_list mode does any work; the other mode prints an error
# and exits.
def process_input(func):
    # Number of documents written back to rdb.
    num_seen = 0
    if args.from_list:
        lst = []
        if args.field is None:
          # No field given: report it and leave lst empty, so both loops over
          # lst below do nothing.
          print "Please provide a --field argument when joining from a list"
        else:
          # (list name, item) pairs for every list named in args.name.
          # NOTE(review): if rdb.lrange follows Redis LRANGE semantics the
          # stop index is inclusive, so up to args.N + 1 items may be
          # returned per list -- confirm.
          lst = [(name, x) for name in args.name for x in rdb.lrange(name, 0, min(rdb.llen(name), args.N)) ]

        # First pass over lst: count documents and, per word, the number of
        # documents containing it; returns a function mapping a word to
        # log(N / document count).
        def computed_idf():
          # N counts every document that could be read, including those
          # whose entities attribute is None.
          N = 0
          idf = collections.defaultdict(float)
          print "computing IDF weights...."
          for (name, d) in lst:
            dd = args.name_prefix + d
            obuff = rdb.get(dd) if args.field is None else rdb.hget(dd, args.field)
            if obuff is None:
              continue
            obj = read_from_buffer(obuff, args.name_prefix + d)
            N += 1
            if obj.entities is None:
              continue
            # Distinct predicate words of this document, restricted to
            # structures at the requested annotation level.
            words = set([s.predicate.word for entity in obj.entities for mention in entity.mentions for s in mention.structures if s.annot_level == annotation_level])
            for w in words:
              idf[w] += 1
          print "done computing idf weights"
          def curried(w):
            # Words never counted use a tiny denominator instead of 0; the
            # membership test also avoids inserting w into the defaultdict.
            return math.log(N / (idf[w] if w in idf else 1E-8))
          #print [(i, curried(i)) for i in idf.keys()]
          return curried
        idf_func = computed_idf()
        # Second pass: apply func to each document and save the result.
        for (di, (name, d)) in enumerate(lst):
          dd = args.name_prefix + d
          obuff = rdb.get(dd) if args.field is None else rdb.hget(dd, args.field)
          if obuff is None:
            continue
          obj = read_from_buffer(obuff, args.name_prefix + d)
          #print "Processing %s" % obj.id
          # Entity count before func runs, used for the "removed" figure
          # printed below.
          pnuments = 0 if obj.entities is None else len(obj.entities)
          # func presumably modifies obj in place (its return value is not
          # used) -- TODO confirm.
          func(obj, out_writer, annotation_level, \
               idf_func, args.threshold, \
               vocabs)
          if obj.entities is None or len(obj.entities) == 0:
            # Documents left without entities are not written back.
            print "EDoc %s no longer has any entities; removing from corpus" % (obj.id)
            continue
          if (di + 1) % 100 == 0:
            print "Saving the %dth document" % (di + 1)
          print "Saving EDoc %s with %d entities (%d removed)" % (obj.id, len(obj.entities), pnuments - len(obj.entities))
          # Serialize the document into an in-memory buffer.
          transport_out = TMemoryBuffer()
          protocol_out = factory.createProtocol(transport_out)
          obj.write(protocol_out)
          # Results are stored under names carrying the threshold as suffix.
          suffix = ":idf" + str(args.threshold)
          if args.field is None:
            # Not reached in practice: lst is empty whenever args.field is
            # None, so this loop body never runs in that case.
            rdb.set(args.name_prefix + d + suffix,
                    transport_out.getvalue())
          else:
            # Store under a suffixed field of the same key and append the
            # item to a suffixed copy of the source list.
            rdb.hset(dd, args.field + suffix, transport_out.getvalue())
            rdb.rpush(name + suffix, d)
          num_seen += 1
    else:
        print "ERROR: Not available"
        exit(1)
        # NOTE(review): assuming exit() terminates the program as usual,
        # nothing below is reached.  The code below also calls
        # read_from_buffer with three arguments and func with a different
        # argument list than the branch above.
        for x in args.name:
            if args.N >= 0 and num_seen >= args.N:
                break
            key_list = rdb.keys(x)
            if args.random:
                random.shuffle(key_list)
            for y in key_list:
                if num_seen >= args.N:
                    break
                ty = rdb.type(y)
                #print y, ty
                if ty == 'string' or \
                   (ty == 'hash' and not args.field is None):
                    if num_seen >= args.N:
                        break
                    obuff = rdb.get(y) if ty == 'string' else rdb.hget(y, args.field)
                    if obuff is None:
                        continue
                    obj = read_from_buffer( obuff, args.which, y )
                    #print "Processing %s" % obj.id
                    func(obj, args, out_writer, vocabs)
                    num_seen += 1
                else:
                    rr = concrete.util.RedisReader(rdb, y, key_type = ty)
                    for obj_buf in rr:
                        if num_seen >= args.N:
                            break
                        obj = read_from_buffer(obj_buf, args.which, y)
                        func(obj, args, out_writer, vocabs)
                        num_seen += 1