Пример #1
0
def connect_index(reset=False):
    if reset:
        print 'deleting existing dict'
        elastic.delete_index()
    elastic.create_index()
    actions = {}

    def index_term(term):
        '''
        Look up definitions and synonyms of term,
        then returns their tokens for indexing further
        '''
        definitions, synonyms = nlp.get_definitions_synonyms(term)
        if not definitions:
            return []
        doc = {'term': term, 'definitions': definitions, 'synonyms': synonyms}
        actions[term] = {
            '_op_type': 'index',
            '_id': hash(term),
            '_index': elastic.SEARCH_INDEX,
            '_type': 'term',
            'doc': doc
        }
        actions_count = len(actions)
        if actions_count > 1000 and actions_count % 1000 == 0:
            retry_commit()
        return nlp.tokenize(nlp.join(*definitions + synonyms))

    def commit_index_actions():
        actionables = filter(None, actions.values())
        if not actionables:
            return False
        successes, errors = elastic.helpers.bulk(elastic.client, actionables)
        if errors:
            print errors
        print 'committed', successes, 'terms'
        print
        for term in actions:
            actions[term] = None
        return True

    def retry_commit():
        for timeout in (0, 10, 20, 30):
            try:
                return commit_index_actions()
            except elastic.exceptions.ConnectionTimeout as error:
                print 'errored', error, 'sleeping for', timeout
                time.sleep(timeout)

    try:
        yield index_term, actions.viewkeys()
    finally:
        if actions:
            retry_commit()
            elastic.refresh_index()
Пример #2
0
def upsert_cities(cities, reset=False):
    '''
    Send one bulk action per city to the search index.

    The index is created via elastic.create_index(); when `reset` is true
    the existing index is deleted first. Each item of `cities` is turned
    into a bulk action by create_city_upsert, all actions go out in a single
    bulk call with a 60 second request timeout, and the index is refreshed
    afterwards.

    Always returns True; nothing is caught here, so any exception raised by
    the elastic helpers propagates to the caller.
    '''
    if reset:
        elastic.delete_index()
    elastic.create_index()

    city_actions = [create_city_upsert(city) for city in cities]
    elastic.helpers.bulk(elastic.client, city_actions, request_timeout=60)

    elastic.refresh_index()
    return True
Пример #3
0
def connect_index(reset=False):
    if reset:
        print 'deleting existing dict'
        elastic.delete_index()
    elastic.create_index()
    actions = {}
    def index_term(term):
        '''
        Look up definitions and synonyms of term,
        then returns their tokens for indexing further
        '''
        definitions, synonyms = nlp.get_definitions_synonyms(term)
        if not definitions:
            return []
        doc = {'term':term,
               'definitions':definitions,
               'synonyms':synonyms}
        actions[term] = {'_op_type':'index',
                         '_id':hash(term),
                         '_index':elastic.SEARCH_INDEX,
                         '_type':'term',
                         'doc':doc
                         }
        actions_count = len(actions)
        if actions_count > 1000 and actions_count % 1000 == 0:
            retry_commit()
        return nlp.tokenize(nlp.join(*definitions + synonyms))
    
    def commit_index_actions():
        actionables = filter(None, actions.values())
        if not actionables:
            return False
        successes, errors = elastic.helpers.bulk(elastic.client, actionables)
        if errors:
            print errors
        print 'committed', successes, 'terms'; print
        for term in actions:
            actions[term] = None
        return True
    
    def retry_commit():
        for timeout in (0,10,20,30):
            try:
                return commit_index_actions()
            except elastic.exceptions.ConnectionTimeout as error:
                print 'errored', error, 'sleeping for', timeout
                time.sleep(timeout)
    
    try:
        yield index_term, actions.viewkeys()
    finally:
        if actions:
            retry_commit()
            elastic.refresh_index()