Пример #1
0
    def scroll(cls,
               q=None,
               page_size=1000,
               limit=None,
               keepalive="1m",
               conn=None,
               raise_on_scroll_error=True,
               types=None,
               wrap=True):
        """
        Lazily iterate over every record that ``tasks.scroll`` yields for a query.

        :param q: query body handed to ``tasks.scroll``; a ``match_all`` query is used when None
        :param page_size: forwarded unchanged to ``tasks.scroll``
        :param limit: forwarded unchanged to ``tasks.scroll``
        :param keepalive: forwarded unchanged to ``tasks.scroll``
        :param conn: connection to use; ``cls.__conn__`` is used when None
        :param raise_on_scroll_error: when true, a ``tasks.ScrollException`` raised while
            iterating is re-raised; when false, iteration simply ends at that point
        :param types: run through ``cls.get_read_types`` before being given to ``tasks.scroll``
        :param wrap: when true each record is yielded as ``cls(record)``, otherwise the
            record is yielded exactly as ``tasks.scroll`` produced it
        :return: a generator over the (optionally wrapped) records
        """
        connection = cls.__conn__ if conn is None else conn
        read_types = cls.get_read_types(types)
        query = {"query": {"match_all": {}}} if q is None else q

        # Created outside the try block on purpose: only errors raised while
        # iterating are subject to raise_on_scroll_error.
        records = tasks.scroll(connection,
                               read_types,
                               query,
                               page_size=page_size,
                               limit=limit,
                               keepalive=keepalive)

        try:
            for record in records:
                yield cls(record) if wrap else record
        except tasks.ScrollException as e:
            if not raise_on_scroll_error:
                # Caller asked for scroll failures to end the iteration quietly
                return
            raise e
Пример #2
0
    def scroll(cls, q=None, page_size=1000, limit=None, keepalive="1m", conn=None, raise_on_scroll_error=True, types=None, wrap=True):
        """
        Generator over the records returned by ``tasks.scroll``.

        ``conn`` falls back to ``cls.__conn__`` and ``q`` falls back to a ``match_all``
        query when they are None.  ``types`` is resolved through ``cls.get_read_types``.
        ``page_size``, ``limit`` and ``keepalive`` are handed to ``tasks.scroll`` as given.

        With ``wrap`` true every record is yielded as ``cls(record)``; otherwise the raw
        record is yielded.  A ``tasks.ScrollException`` raised during iteration is
        re-raised when ``raise_on_scroll_error`` is true and otherwise just stops the
        generator.
        """
        connection = conn if conn is not None else cls.__conn__
        read_types = cls.get_read_types(types)
        query = q if q is not None else {"query": {"match_all": {}}}
        scroll_options = {"page_size": page_size, "limit": limit, "keepalive": keepalive}

        # The scroll is set up before entering the try block, so only failures
        # that happen while iterating are affected by raise_on_scroll_error.
        hits = tasks.scroll(connection, read_types, query, **scroll_options)

        try:
            for hit in hits:
                if not wrap:
                    yield hit
                else:
                    yield cls(hit)
        except tasks.ScrollException as e:
            if raise_on_scroll_error:
                raise e
            # Otherwise fall through and let the generator finish normally
Пример #3
0
def scroll_edit(connection, es_type, query):
    """ Use a scroll search to update a field based on a given query

    Each record yielded by ``tasks.scroll`` is serialised to JSON, passed through
    ``match_hindawi_urls.sub`` with ``replacement_text``, parsed back into an object
    and queued.  Queued records are sent with ``raw.bulk`` whenever ``batch_size``
    of them have accumulated, and once more at the end for any remainder.

    :param connection: connection given to both ``tasks.scroll`` and ``raw.bulk``
    :param es_type: type to scroll over and to write the edited records back to
    :param query: query passed to ``tasks.scroll`` as ``q``
    """
    write_batch = []
    batch_size = 200

    for a in tasks.scroll(connection, type=es_type, q=query):

        # Substitute the text and add to the write batch
        d = match_hindawi_urls.sub(replacement_text, json.dumps(a))
        write_batch.append(json.loads(d))

        # When we have enough, do some writing
        if len(write_batch) >= batch_size:
            # Single-argument print() behaves the same on Python 2 and 3; the
            # two spaces reproduce the output of `print "writing ", n`.
            print("writing  %d" % len(write_batch))
            raw.bulk(connection, es_type, write_batch)
            write_batch = []

    # Write the last part-batch to index
    if len(write_batch) > 0:
        print("writing  %d" % len(write_batch))
        raw.bulk(connection, es_type, write_batch)
Пример #4
0
def scroll_edit(connection, es_type, query):
    """ Scroll over the records matching a query and bulk-write edited copies of them.

    Every scrolled record is dumped to JSON, rewritten with
    ``match_hindawi_urls.sub(replacement_text, ...)``, loaded back and buffered.
    The buffer is handed to ``raw.bulk`` each time it reaches 200 entries, and
    whatever is left over is written after the scroll finishes.
    """
    batch_size = 200
    pending = []

    for record in tasks.scroll(connection, type=es_type, q=query):
        # Apply the substitution on the serialised form, then queue the result
        rewritten = match_hindawi_urls.sub(replacement_text, json.dumps(record))
        pending.append(json.loads(rewritten))

        # Keep collecting until a full batch is available
        if len(pending) < batch_size:
            continue

        print("writing ", len(pending))
        raw.bulk(connection, es_type, pending)
        pending = []

    # Flush the final, partially filled batch
    if pending:
        print("writing ", len(pending))
        raw.bulk(connection, es_type, pending)
Пример #5
0
            }
        }
    }

# Connection to the ES index
# (arguments go straight to raw.make_connection; 'doaj' is presumably the
# index name - TODO confirm)
conn = raw.make_connection(None, 'localhost', 9200, 'doaj')

# Edit the articles: for every ISSN in missed_articles, scroll the matching
# 'article' records and queue a licence-corrected copy of each in write_batch.

write_batch = []
batch_size = 1000

for article_issn in missed_articles:
    # The same query dict is reused; only its query_string text changes per ISSN
    query['query']['query_string']['query'] = article_issn

    for a in tasks.scroll(conn, 'article', query):
        try:
            article_model = models.Article(_source=a)
            a_license = article_model.data.get('index')['license']
            # Change the license: the first existing entry is looked up in
            # license_correct_dict and the result replaces the whole list
            article_model.data.get('index')['license'] = [license_correct_dict[a_license[0]]]
            write_batch.append(article_model.data)
        except ValueError:
            # presumably raised by models.Article for a bad record - verify
            print "Failed to create a model"
        except KeyError:
            # Either subscript lookup above can raise this: no 'license' key,
            # or no entry in license_correct_dict for the current value
            print "No license to change"

    # When we have enough, do some writing
    # NOTE(review): this check runs once per ISSN (outer loop), and in the
    # lines visible here write_batch is never emptied after the bulk call nor
    # flushed after the loop - confirm against the rest of the script.
    if len(write_batch) >= batch_size:
        print "writing ", len(write_batch)
        models.Article.bulk(write_batch)
Пример #6
0
            }
        }
    }

# Connection to the ES index
# (arguments go straight to raw.make_connection; 'doaj' is presumably the
# index name - TODO confirm)
conn = raw.make_connection(None, 'localhost', 9200, 'doaj')

# Edit the articles: for every ISSN in missed_articles, scroll the matching
# 'article' records and queue a licence-corrected copy of each in write_batch.

write_batch = []
batch_size = 1000

for article_issn in missed_articles:
    # The same query dict is reused; only its query_string text changes per ISSN
    query['query']['query_string']['query'] = article_issn

    for a in tasks.scroll(conn, 'article', query):
        try:
            article_model = models.Article(_source=a)
            a_license = article_model.data.get('index')['license']
            # Change the license: the first existing entry is looked up in
            # license_correct_dict and the result replaces the whole list
            article_model.data.get('index')['license'] = [
                license_correct_dict[a_license[0]]
            ]
            write_batch.append(article_model.data)
        except ValueError:
            # presumably raised by models.Article for a bad record - verify
            print "Failed to create a model"
        except KeyError:
            # Either subscript lookup above can raise this: no 'license' key,
            # or no entry in license_correct_dict for the current value
            print "No license to change"

    # When we have enough, do some writing
    # NOTE(review): the body of this check lies beyond the visible lines;
    # the check itself runs once per ISSN (outer loop), not once per record.
    if len(write_batch) >= batch_size:
Пример #7
0
# Buffer of journal records to write, and the batch size setting
batch_size = 1000
write_batch = []

# Running totals, one per outcome (ints are immutable, so a chained
# assignment gives four independent counters)
edited = failed = unchanged = nolicence = 0

# Per-outcome lists; `ed` receives the ids of edited journals in the loop
# below, the others presumably mirror the remaining counters - verify
ed, fa, un, nl = [], [], [], []

# Process the previous set of journals
for j in tasks.scroll(conn, 'journal'):
    try:
        journal_model = models.Journal(_source=j)
        # Change the license
        j_license = journal_model.bibjson().get_license()
        if j_license:
            j_license['type'] = license_correct_dict[j_license['type']]
            j_license['title'] = license_correct_dict[j_license['title']]
            print "edited\t{0}".format(journal_model.id)
            ed.append(journal_model.id)
            edited += 1
            journal_model.prep()
            write_batch.append(journal_model.data)
        else:
            nolicence += 1
            print "no licence\t{0}".format(journal_model.id)