Пример #1
0
def countRandomBitFrequencies(numTerms=100000, percentSparsity=0.01):
    """Create a uniformly random counts matrix through sampling."""
    # Accumulate counts by inplace-adding sparse matrices
    counts = SparseMatrix()
    size = 128 * 128
    counts.resize(1, size)

    # Pre-allocate buffer sparse matrix
    sparseBitmap = SparseMatrix()
    sparseBitmap.resize(1, size)

    random.seed(42)

    # Accumulate counts for each bit for each word
    numWords = 0
    for term in xrange(numTerms):
        bitmap = random.sample(xrange(size), int(size * percentSparsity))
        bitmap.sort()

        sparseBitmap.setRowFromSparse(0, bitmap, [1] * len(bitmap))
        counts += sparseBitmap
        numWords += 1

    # Compute normalized version of counts as a separate matrix
    frequencies = SparseMatrix()
    frequencies.resize(1, size)
    frequencies.copy(counts)
    frequencies.divide(float(numWords))

    # Wrap up by printing some statistics and then saving the normalized version
    printFrequencyStatistics(counts, frequencies, numWords, size)

    frequencyFilename = "bit_frequencies_random.pkl"
    print "Saving frequency matrix in", frequencyFilename
    with open(frequencyFilename, "wb") as frequencyPickleFile:
        pickle.dump(frequencies, frequencyPickleFile)

    return counts
def countRandomBitFrequencies(numTerms = 100000, percentSparsity = 0.01):
  """Create a uniformly random counts matrix through sampling."""
  # Accumulate counts by inplace-adding sparse matrices
  counts = SparseMatrix()
  size = 128*128
  counts.resize(1, size)

  # Pre-allocate buffer sparse matrix
  sparseBitmap = SparseMatrix()
  sparseBitmap.resize(1, size)

  random.seed(42)

  # Accumulate counts for each bit for each word
  numWords=0
  for term in xrange(numTerms):
    bitmap = random.sample(xrange(size), int(size*percentSparsity))
    bitmap.sort()

    sparseBitmap.setRowFromSparse(0, bitmap, [1]*len(bitmap))
    counts += sparseBitmap
    numWords += 1

  # Compute normalized version of counts as a separate matrix
  frequencies = SparseMatrix()
  frequencies.resize(1, size)
  frequencies.copy(counts)
  frequencies.divide(float(numWords))

  # Wrap up by printing some statistics and then saving the normalized version
  printFrequencyStatistics(counts, frequencies, numWords, size)

  frequencyFilename = "bit_frequencies_random.pkl"
  print "Saving frequency matrix in",frequencyFilename
  with open(frequencyFilename, "wb") as frequencyPickleFile:
    pickle.dump(frequencies, frequencyPickleFile)

  return counts
Пример #3
0
def countBitFrequenciesForTerms(client,
                                lines,
                                acceptanceProbability=0.1,
                                usePlaceholderEncoding=True,
                                percentSparsity=0.0102):
    # Accumulate counts by inplace-adding sparse matrices
    skippedWords = {}
    counts = SparseMatrix()
    width = RETINA_SIZES[client.retina]["width"]
    height = RETINA_SIZES[client.retina]["height"]
    counts.resize(1, width * height)

    # Pre-allocate buffer sparse matrix
    sparseBitmap = SparseMatrix()
    sparseBitmap.resize(1, width * height)

    # Accumulate counts for each bit for each word
    numWords = 0
    numLines = 0
    for line in lines:
        tokens = TextPreprocess().tokenize(line)
        for term in tokens:

            p = random.uniform(0, 1)
            if p <= acceptanceProbability:
                if usePlaceholderEncoding:
                    random.seed(term)
                    bitmap = random.sample(
                        xrange(width * height),
                        int(width * height * percentSparsity))
                    bitmap.sort()
                    random.seed(p)
                else:
                    try:
                        bitmap = client.getBitmap(
                            term)["fingerprint"]["positions"]
                    except Exception as err:
                        print "Skipping '{}', reason: {}".format(
                            term, str(err))
                        continue

                    if not bitmap:
                        skippedWords[term] = skippedWords.get(term, 0) + 1
                        # print "Skipping '{}', reason: empty".format(term)
                        continue

                sparseBitmap.setRowFromSparse(0, bitmap, [1] * len(bitmap))
                counts += sparseBitmap
                numWords += 1

        numLines += 1
        if numLines % 1000 == 0:
            print "...processed=", numLines, "lines and", numWords, "words"

    # Compute normalized version of counts as a separate matrix
    frequencies = SparseMatrix()
    frequencies.resize(1, width * height)
    frequencies.copy(counts)
    frequencies.divide(float(numWords))

    # Wrap up by printing some statistics and then saving the normalized version
    print "Processed", numLines, "lines"
    printFrequencyStatistics(counts, frequencies, numWords, width * height)

    frequencyFilename = "bit_frequencies_" + client.retina + ".pkl"
    print "Saving frequency matrix in", frequencyFilename
    with open(frequencyFilename, "wb") as frequencyPickleFile:
        pickle.dump(frequencies, frequencyPickleFile)

    print "These words were skipped N times because of empty bitmap result"
    print skippedWords

    return counts
def countBitFrequenciesForTerms(client, lines,
                                acceptanceProbability = 0.1,
                                usePlaceholderEncoding = True,
                                percentSparsity = 0.0102):
  # Accumulate counts by inplace-adding sparse matrices
  skippedWords = {}
  counts = SparseMatrix()
  width = RETINA_SIZES[client.retina]["width"]
  height = RETINA_SIZES[client.retina]["height"]
  counts.resize(1, width*height)

  # Pre-allocate buffer sparse matrix
  sparseBitmap = SparseMatrix()
  sparseBitmap.resize(1, width*height)

  # Accumulate counts for each bit for each word
  numWords=0
  numLines=0
  for line in lines:
    tokens = TextPreprocess().tokenize(line)
    for term in tokens:

      p = random.uniform(0,1)
      if p <= acceptanceProbability:
        if usePlaceholderEncoding:
          random.seed(term)
          bitmap = random.sample(xrange(width*height),
                                 int(width*height*percentSparsity))
          bitmap.sort()
          random.seed(p)
        else:
          try:
            bitmap = client.getBitmap(term)["fingerprint"]["positions"]
          except Exception as err:
            print "Skipping '{}', reason: {}".format(term, str(err))
            continue

          if not bitmap:
            skippedWords[term] = skippedWords.get(term,0)+1
            # print "Skipping '{}', reason: empty".format(term)
            continue

        sparseBitmap.setRowFromSparse(0, bitmap, [1]*len(bitmap))
        counts += sparseBitmap
        numWords += 1

    numLines += 1
    if numLines%1000==0:
      print "...processed=",numLines,"lines and",numWords,"words"

  # Compute normalized version of counts as a separate matrix
  frequencies = SparseMatrix()
  frequencies.resize(1, width*height)
  frequencies.copy(counts)
  frequencies.divide(float(numWords))

  # Wrap up by printing some statistics and then saving the normalized version
  print "Processed",numLines,"lines"
  printFrequencyStatistics(counts, frequencies, numWords, width*height)

  frequencyFilename = "bit_frequencies_"+client.retina+".pkl"
  print "Saving frequency matrix in",frequencyFilename
  with open(frequencyFilename, "wb") as frequencyPickleFile:
    pickle.dump(frequencies, frequencyPickleFile)

  print "These words were skipped N times because of empty bitmap result"
  print skippedWords

  return counts