Exemplo n.º 1
0
def idxmerge(cdbname, idxstomerge, verbose=0):
  """Merge several index objects into one new cdb index file.

  cdbname: path of the merged index to create (built via cdbname+'.tmp').
  idxstomerge: sequence of index objects to merge; their order determines
    merge priority via idxorder below.
  verbose: when true, progress and summary lines go to stderr.

  NOTE: Python 2 only (print-chevron syntax, dict.iteritems).
  """
  # Count all the unique locations and assign new document ids.
  idxorder = {}
  loc2docid = {}
  for (i,idx) in enumerate(reversed(idxstomerge)):
    idx.assignnewids1(loc2docid)
    idxorder[idx] = i
  # Renumber: final docid = n - provisional docid, i.e. the assignment
  # order is reversed so earlier-listed indices end up with higher ids.
  n = len(loc2docid)
  loc2docid = dict( (loc,n-docid) for (loc,docid) in loc2docid.iteritems() )
  for idx in idxstomerge:
    idx.assignnewids2(loc2docid)
  # Create a new index file.
  maker = cdb.cdbmake(cdbname, cdbname+'.tmp')
  if verbose:
    print >>sys.stderr, 'Merging: %r (docs=%d, est. terms=%d): %r' % \
          (cdbname, sum( idx.ndocs for idx in idxstomerge ),
           estimate_terms( idx.nterms for idx in idxstomerge ), idxstomerge)
  # Copy sentences to a new index file with unique ids.
  for idx in idxstomerge:
    idx.copysents(maker)
  # Merge document ids and offsets.
  nterms = 0
  docid2info = []
  for (k,vs) in cdbmerge(idxstomerge):
    # Stop the term pass once LOC/IDXINFO records are reached -- assumes
    # cdbmerge yields keys so that those follow all term records; confirm
    # against cdbmerge's contract.
    if k[0] == PROP_LOC or k[0] == PROP_IDXINFO: break
    if k[0] == PROP_DOCID: 
      # read a docid->loc mapping
      # Key layout per the format string: one skipped marker byte then a
      # big-endian int32 old docid.
      (oldid,) = unpack('>xi', k)
      for (info,idx) in vs:
        if oldid not in idx.old2new: continue
        newid = idx.old2new[oldid]
        docid2info.append((newid, info))
        # info[4:] is presumably the location after a 4-byte prefix --
        # sanity-check that it maps back to the same new id.
        assert loc2docid[info[4:]] == newid
    else:
      # merge docid+pos sets
      # Sort per-index arrays by merge priority so the concatenation
      # below is deterministic across runs.
      vs = sorted(( (idxorder[idx], idx.convertoldids(v)) for (v,idx) in vs ))
      # Two array slots per entry -- presumably (docid,pos) pairs; verify
      # against encode_array's expectations.
      ents = sum( len(a) for (_,a) in vs )/2
      (_,r) = vs.pop(0)
      for (_,a) in vs:
        r.extend(a)
      maker.add(k, encode_array(ents, r))
      nterms += 1
      if verbose and nterms % 1000 == 0:
        sys.stderr.write('.'); sys.stderr.flush()

  # write docid->loc mappings (avoiding dupes)
  docid2info.sort()
  for (docid,info) in docid2info:
    maker.add(pack('>ci', PROP_DOCID, docid), info)
  # write loc->docid mappings (avoiding dupes)
  for (loc,docid) in sorted(loc2docid.iteritems()):
    if loc:
      maker.add(PROP_LOC+loc, pack('>i', docid))

  if verbose:
    print >>sys.stderr, 'done: docs=%d, terms=%d' % (len(docid2info), nterms)
  # Trailer record: total document count and term count for the merged index.
  maker.add(PROP_IDXINFO, pack('>ii', len(docid2info), nterms))
  maker.finish()
  return
Exemplo n.º 2
0
def idxmerge(cdbname, idxstomerge, verbose=0):
    """Merge the given index objects into a single cdb index at cdbname.

    The output is written through a temporary file (cdbname + '.tmp').
    The position of each index in idxstomerge fixes its merge priority
    (recorded in idxorder).  If verbose is truthy, progress dots and a
    summary are printed to stderr.

    NOTE: Python 2 code (print-chevron, dict.iteritems).
    """
    # Count all the unique locations and assign new document ids.
    idxorder = {}
    loc2docid = {}
    for (i, idx) in enumerate(reversed(idxstomerge)):
        idx.assignnewids1(loc2docid)
        idxorder[idx] = i
    # Invert the provisional ids: final id = n - provisional id, so the
    # numbering runs opposite to the assignment order above.
    n = len(loc2docid)
    loc2docid = dict(
        (loc, n - docid) for (loc, docid) in loc2docid.iteritems())
    for idx in idxstomerge:
        idx.assignnewids2(loc2docid)
    # Create a new index file.
    maker = cdb.cdbmake(cdbname, cdbname + '.tmp')
    if verbose:
        print >>sys.stderr, 'Merging: %r (docs=%d, est. terms=%d): %r' % \
              (cdbname, sum( idx.ndocs for idx in idxstomerge ),
               estimate_terms( idx.nterms for idx in idxstomerge ), idxstomerge)
    # Copy sentences to a new index file with unique ids.
    for idx in idxstomerge:
        idx.copysents(maker)
    # Merge document ids and offsets.
    nterms = 0
    docid2info = []
    for (k, vs) in cdbmerge(idxstomerge):
        # End the term pass at the first LOC/IDXINFO record -- assumes
        # cdbmerge emits those after every term record (TODO confirm).
        if k[0] == PROP_LOC or k[0] == PROP_IDXINFO: break
        if k[0] == PROP_DOCID:
            # read a docid->loc mapping
            # '>xi': skip one marker byte, then a big-endian int32 old id.
            (oldid, ) = unpack('>xi', k)
            for (info, idx) in vs:
                if oldid not in idx.old2new: continue
                newid = idx.old2new[oldid]
                docid2info.append((newid, info))
                # info[4:] presumably strips a 4-byte prefix to get the
                # location; check consistency with the loc2docid table.
                assert loc2docid[info[4:]] == newid
        else:
            # merge docid+pos sets
            # Sort the converted arrays by merge priority so concatenation
            # order is deterministic.
            vs = sorted(
                ((idxorder[idx], idx.convertoldids(v)) for (v, idx) in vs))
            # Each entry occupies two array slots, presumably (docid, pos)
            # pairs -- verify against encode_array.
            ents = sum(len(a) for (_, a) in vs) / 2
            (_, r) = vs.pop(0)
            for (_, a) in vs:
                r.extend(a)
            maker.add(k, encode_array(ents, r))
            nterms += 1
            if verbose and nterms % 1000 == 0:
                sys.stderr.write('.')
                sys.stderr.flush()

    # write docid->loc mappings (avoiding dupes)
    docid2info.sort()
    for (docid, info) in docid2info:
        maker.add(pack('>ci', PROP_DOCID, docid), info)
    # write loc->docid mappings (avoiding dupes)
    for (loc, docid) in sorted(loc2docid.iteritems()):
        if loc:
            maker.add(PROP_LOC + loc, pack('>i', docid))

    if verbose:
        print >> sys.stderr, 'done: docs=%d, terms=%d' % (len(docid2info),
                                                          nterms)
    # Final trailer record: total documents and total terms in the merge.
    maker.add(PROP_IDXINFO, pack('>ii', len(docid2info), nterms))
    maker.finish()
    return
Exemplo n.º 3
0
 def add_idx(self, idxid):
   # Open a cdb writer for the index file belonging to idxid and hand
   # back both the final filename and the writer itself.
   path = self.gen_idx_fname(idxid)
   return (path, cdb.cdbmake(path, path+'.tmp'))
Exemplo n.º 4
0
 def add_idx(self, idxid):
     """Return (filename, cdb maker) for a fresh index file for idxid."""
     target = self.gen_idx_fname(idxid)
     writer = cdb.cdbmake(target, target + '.tmp')
     return (target, writer)