Example No. 1
def update_index(units, source_units=None):
    '''
    Update fulltext index for the given set of units.
    '''
    languages = Language.objects.have_translation()

    # Default to same set for both updates
    if source_units is None:
        source_units = units

    # Update source index
    index = get_source_index()
    writer = BufferedWriter(index)
    try:
        for unit in source_units.iterator():
            update_source_unit_index(writer, unit)
    finally:
        writer.close()

    # Update per language indices
    for lang in languages:
        index = get_target_index(lang.code)
        writer = BufferedWriter(index)
        try:
            language_units = units.filter(
                translation__language=lang
            ).exclude(
                target=''
            )

            for unit in language_units.iterator():
                update_target_unit_index(writer, unit)
        finally:
            writer.close()
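Note: Examples 2-4 below are variants of the same Weblate helper; they add a queryset.exists() guard so a BufferedWriter (and the commit its close() triggers) is never opened for an empty set of units.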
Example No. 2
def update_index(units, source_units=None):
    '''
    Update fulltext index for the given set of units.
    '''
    languages = Language.objects.have_translation()

    # Default to same set for both updates
    if source_units is None:
        source_units = units

    # Update source index
    if source_units.exists():
        index = get_source_index()
        writer = BufferedWriter(index)
        try:
            for unit in source_units.iterator():
                update_source_unit_index(writer, unit)
        finally:
            writer.close()

    # Update per language indices
    for lang in languages:
        language_units = units.filter(translation__language=lang).exclude(
            target='')

        if language_units.exists():
            index = get_target_index(lang.code)
            writer = BufferedWriter(index)
            try:
                for unit in language_units.iterator():
                    update_target_unit_index(writer, unit)
            finally:
                writer.close()
Example No. 3
def update_index(units):
    """Update fulltext index for given set of units."""
    languages = Language.objects.have_translation()

    # Update source index
    if units.exists():
        index = get_source_index()
        writer = BufferedWriter(index)
        try:
            for unit in units.iterator():
                update_source_unit_index(writer, unit)
        finally:
            writer.close()

    # Update per language indices
    for lang in languages:
        language_units = units.filter(
            translation__language=lang
        ).exclude(
            target=''
        )

        if language_units.exists():
            index = get_target_index(lang.code)
            writer = BufferedWriter(index)
            try:
                for unit in language_units.iterator():
                    update_target_unit_index(writer, unit)
            finally:
                writer.close()
Example No. 4
def update_index(units):
    """Update fulltext index for given set of units."""
    languages = Language.objects.have_translation()

    # Update source index
    if units.exists():
        index = get_source_index()
        writer = BufferedWriter(index)
        try:
            for unit in units.iterator():
                update_source_unit_index(writer, unit)
        finally:
            writer.close()

    # Update per language indices
    for lang in languages:
        language_units = units.filter(translation__language=lang).exclude(
            target='')

        if language_units.exists():
            index = get_target_index(lang.code)
            writer = BufferedWriter(index)
            try:
                for unit in language_units.iterator():
                    update_target_unit_index(writer, unit)
            finally:
                writer.close()
Example No. 5
    def handle(self, *args, **options):
        # Optionally rebuild indices from scratch
        if options['clean']:
            create_source_index()
            for lang in Language.objects.have_translation():
                create_target_index(lang=lang.code)

        # Open writer
        source_writer = BufferedWriter(get_source_index())
        target_writers = {}

        try:
            # Process all units
            for unit in self.iterate_units(*args, **options):
                # Update source index
                update_source_unit_index(source_writer, unit)
                # Update target index, guarding before unit.translation is used
                if unit.translation:
                    lang = unit.translation.language.code
                    # Lazily open one writer per language
                    if lang not in target_writers:
                        target_writers[lang] = BufferedWriter(
                            get_target_index(lang)
                        )
                    update_target_unit_index(target_writers[lang], unit)

        finally:
            # Close all writers
            source_writer.close()
            for lang in target_writers:
                target_writers[lang].close()
Example No. 6
    def handle(self, *args, **options):
        # Optionally rebuild indices from scratch
        if options['clean']:
            create_source_index()
            for lang in Language.objects.have_translation():
                create_target_index(lang=lang.code)

        # Open writer
        source_writer = BufferedWriter(get_source_index())
        target_writers = {}

        try:
            # Process all units
            for unit in self.iterate_units(*args, **options):
                # Update source index
                update_source_unit_index(source_writer, unit)
                # Update target index, guarding before unit.translation is used
                if unit.translation:
                    lang = unit.translation.language.code
                    # Lazily open one writer per language
                    if lang not in target_writers:
                        target_writers[lang] = BufferedWriter(
                            get_target_index(lang))
                    update_target_unit_index(target_writers[lang], unit)

        finally:
            # Close all writers
            source_writer.close()
            for lang in target_writers:
                target_writers[lang].close()
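Note: Examples 5 and 6 show the batch-rebuild variant of the same pattern: one BufferedWriter per language is opened lazily into a dict, and the finally block guarantees every writer is closed, and therefore committed, even if iteration fails part way.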
Example No. 7
def main():
    queue = HotQueue(main_config.INDEX_QUEUE,
                     host=main_config.REDIS_HOST,
                     port=main_config.REDIS_PORT)
    index = get_index(main_config.WHOOSH_INDEX_DIR)
    writer = BufferedWriter(index, limit=10)
    try:
        for doc_id in queue.consume():
            print("looking at {}".format(doc_id))
            doc = Document.query.get(doc_id)
            if doc:
                write_doc(doc, writer)
            else:
                print("no doc with doc_id {}".format(doc_id))
    finally:
        writer.close()
Example No. 8
def bootstrap_index(dirname=None, indexname=None):
    """
    Create spam index and add one post from the
    """
    if dirname and indexname:
        ix = search.init_index(dirname=dirname, indexname=indexname, schema=spam_schema())
    else:
        ix = init_spam_index()

    writer = BufferedWriter(ix)
    # Write text to index.
    index_writer(writer=writer, title="Placeholder",
                 content_length=0, is_spam=True,
                 content='CONTENT', uid=STARTER_UID)
    writer.commit()
    writer.close()

    return ix
Example No. 9
def test_20000_buffered():
    # Helpers below come from Whoosh's own test utilities and compat layer.
    import random

    from whoosh import fields
    from whoosh.compat import text_type, u, xrange
    from whoosh.util import now
    from whoosh.util.testing import TempIndex
    from whoosh.writing import BufferedWriter

    sc = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT)
    with TempIndex(sc, "20000buffered") as ix:
        domain = ["alfa", "bravo", "charlie", "delta", "echo", "foxtrot",
                  "golf", "hotel", "india", "juliet", "kilo", "lima"]

        t = now()
        w = BufferedWriter(ix, limit=100, period=None)
        for i in xrange(20000):
            w.add_document(id=text_type(i),
                           text=u(" ").join(random.sample(domain, 5)))
        w.close()
        print("Write buffered:", now() - t)

        t = now()
        ix.optimize()
        print("Optimize buffered:", now() - t)
Example No. 10
def delete_search_units(source_units, languages):
    '''
    Delete fulltext index entries for the given set of units.
    '''
    # Delete from the source index
    if source_units:
        index = get_source_index()
        writer = BufferedWriter(index)
        try:
            for pk in source_units:
                writer.delete_by_term('pk', pk)
        finally:
            writer.close()

    # Delete from each per-language target index
    for lang, units in languages.items():
        index = get_target_index(lang)
        writer = BufferedWriter(index)
        try:
            for pk in units:
                writer.delete_by_term('pk', pk)
        finally:
            writer.close()
Example No. 11
def test_classify(threshold=None,
                  niter=100,
                  limitmb=1024,
                  size=100,
                  verbosity=0):

    if threshold is None:
        threshold = settings.SPAM_THRESHOLD

    # Get the spam posts (flagged as spam or deleted).
    spam = Post.objects.filter(Q(spam=Post.SPAM) | Q(status=Post.DELETED))

    # Get the valid posts and shuffle.
    ham = Post.objects.valid_posts(author__profile__score__lte=0,
                                   type__in=[Post.ANSWER, Post.COMMENT])

    # Get lists of ids for both sets.
    spam = list(spam.values_list("id", flat=True))
    ham = list(ham.values_list("id", flat=True))

    # tp = spam identified correctly (true positive).
    # tn = valid post identified correctly (true negative).
    # fn = spam missed (false negative).
    # fp = valid post misidentified as spam (false positive).
    tp, tn, fn, fp = 0, 0, 0, 0
    seen_ham, seen_spam = 0, 0
    elapsed, progress = util.timer_func()

    for i in range(niter):
        # Remove previous index
        if os.path.exists(TRAIN_DIR):
            shutil.rmtree(TRAIN_DIR)

        ix = search.init_index(
            dirname=TRAIN_DIR,
            indexname=f"train_{util.get_uuid(8)}_{settings.SPAM_INDEX_NAME}",
            schema=spam_schema())
        writer = BufferedWriter(ix,
                                limit=int((niter / 2) + 1),
                                writerargs=dict(limitmb=limitmb,
                                                multisegment=True))

        index_writer(writer=writer,
                     title="Placeholder",
                     content_length=0,
                     is_spam=True,
                     content='CONTENT',
                     uid=STARTER_UID)

        # Take one spam post out of training set.
        one_out = one_out_train(spam=spam, writer=writer, size=size, ham=ham)
        writer.commit()
        writer.close()
        post_score = compute_score(post=one_out, ix=ix)

        predicted_spam = post_score >= threshold
        is_spam = one_out.is_spam or one_out.is_deleted
        is_ham = not is_spam

        seen_spam += 1 if is_spam else 0
        seen_ham += 1 if is_ham else 0

        detail(is_spam=is_spam,
               predict=predicted_spam,
               post=one_out,
               verb=verbosity,
               post_score=post_score)

        if predicted_spam:
            tp += 1 if is_spam else 0
            fp += 1 if is_ham else 0

        else:
            fn += 1 if is_spam else 0
            tn += 1 if is_ham else 0

        progress(i, step=5, msg=f"iterations. tp={tp} fp={fp} tn={tn} fn={fn}")

    train_spam = sizer(spam, size)
    train_ham = sizer(ham, size)
    print(f"... {train_ham + train_spam}\tSize of index ( per iteration )")
    print(f"... \t{train_spam}\tSPAM")
    print(f"... \t{train_ham}\tHAM")
    print(f"\n... {niter}\tNumber of iterations")
    report(nham=seen_ham, nspam=seen_spam, tn=tn, tp=tp, fp=fp, fn=fn)

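Note: unlike the other examples, this one sizes the buffer to the workload (limit=int((niter / 2) + 1)) and passes writerargs, which BufferedWriter forwards to the underlying index writer it opens for each flush, so options such as limitmb and multisegment apply there.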
Example No. 12
            if not isinstance(cd, unicode):
                cd = reader.unicodify(cd)
            writer.update_document(title=unicode(title), path=unicode(f),
                                   filename=unicode(title), content=cd)
            indexed = True
            ds.save_document(id, f, indexed=indexed, modified=modified,
                             metadata=file_md)

        except Exception as e:
            print('skipping {}'.format(e))
            # Record the failure, then bail out of this file.
            ds.save_document(id, f, indexed=indexed, modified=modified,
                             error=e.message)
            return


    index_path('/')
    writer.close()
    print('committing')
    ds.update_indexed_time(id)


def _index_users():
    processes = {}
    while True:
        linked_users = database.DataStore().linked_users()
        for user in linked_users:
            indexed_time = user.get('indexed_time')
            if indexed_time and (datetime.datetime.utcnow() - indexed_time).total_seconds() < INDEX_INTERVAL:
                continue
            if user['_id'] in processes:
                continue
            
Example No. 13
    def delete(self, note_id):
        writer = BufferedWriter(self.index)
        writer.delete_by_term('note_id', note_id)
        writer.close()
Example No. 14
    def update(self, note):
        writer = BufferedWriter(self.index, period=10, limit=10)
        writer.update_document(note_id=note.id, notebook_id=note.notebook_id,
                               title=note.title, snippet=note.snippet)
        writer.close()
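Examples 13 and 14 rely on note_id being declared unique in the schema: update_document() first deletes any existing document with the same unique-field value, and delete_by_term() removes every document whose field matches the term. A minimal sketch of that contract (directory and field names are hypothetical):

import os

from whoosh.fields import ID, TEXT, Schema
from whoosh.index import create_in
from whoosh.writing import BufferedWriter

os.makedirs("notes_ix", exist_ok=True)
ix = create_in("notes_ix", Schema(note_id=ID(stored=True, unique=True),
                                  title=TEXT(stored=True)))
writer = BufferedWriter(ix, period=10, limit=10)
try:
    writer.update_document(note_id=u"42", title=u"first draft")
    writer.update_document(note_id=u"42", title=u"second draft")  # replaces the first
    writer.delete_by_term('note_id', u"42")  # removes the surviving document
finally:
    writer.close()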
Example No. 15
            key = f'{eid}:{locale}:tags'
            for tag in tags['values']:
                storage.lpush(key, tag)


if __name__ == '__main__':
    print('-' * 30)
    print('Muzeeglot data ingestion')
    print('-' * 30)
    if exists(configuration.INGESTION_LOCK):
        print('WARN: ingestion lock detected, pass')
    else:
        print('INFO: evaluate tags corpus')
        tags_corpus = get_tags_corpus()
        print('INFO: create search index')
        if not exists(configuration.INDEX):
            makedirs(configuration.INDEX)
        schema = Schema(ngram=NGRAMWORDS(), name=STORED(), eid=STORED())
        index = create_in(configuration.INDEX, schema)
        writer = BufferedWriter(index, period=60, limit=200)
        ingest_languages(writer)
        ingest_tags(tags_corpus)
        ingest_entities(tags_corpus, writer)
        print('INFO: optimize and close index')
        writer.close()
        index.optimize()
        index.close()
        print('INFO: write ingestion lock')
        with open(configuration.INGESTION_LOCK, 'w') as stream:
            stream.write('ingested')
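One BufferedWriter feature none of the examples above use: the writer can answer searches that already see the still-buffered, uncommitted documents. A minimal, self-contained sketch (the livedemo directory and toy schema are made up):

import os

from whoosh.fields import ID, TEXT, Schema
from whoosh.index import create_in
from whoosh.qparser import QueryParser
from whoosh.writing import BufferedWriter

os.makedirs("livedemo", exist_ok=True)
ix = create_in("livedemo", Schema(id=ID(stored=True, unique=True), text=TEXT))
writer = BufferedWriter(ix, limit=100)
try:
    writer.add_document(id=u"1", text=u"buffered but not yet committed")
    # BufferedWriter.searcher() also covers documents still in the buffer.
    with writer.searcher() as searcher:
        query = QueryParser("text", ix.schema).parse(u"buffered")
        print(len(searcher.search(query)))  # prints 1
finally:
    writer.close()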