예제 #1
0
def index_patent(ipath):
    """Bulk-index the rows of patent.tsv into the ``patent_tmp`` index.

    The file is read in chunks of 50000 rows. For every chunk the columns
    that are not indexed are dropped, rows lacking an ``id``, ``title`` or
    ``abstract`` are discarded, and the remaining rows are written as
    newline-delimited action/document JSON pairs to a temporary file next
    to ``ipath``. That file is handed to ``bulk_insert`` and removed once
    indexing finishes or fails.

    Parameters
    ----------
    ipath : str
        Path to patent.tsv.

    Returns
    -------
    None
        The function is used for its side effects only.

    """

    print('Indexing patent.')
    index_name = 'patent_tmp'
    opath = os.path.join(os.path.dirname(ipath), 'patent.index.tmp.json')
    create_index(index_name)
    # quoting=3 is csv.QUOTE_NONE: quote characters are kept as plain text.
    chunks = pd.read_csv(ipath,
                         sep='\t',
                         quoting=3,
                         lineterminator='\n',
                         dtype=str,
                         chunksize=50000)
    try:
        for chunk in chunks:
            chunk.drop(columns=[
                'type', 'number', 'country', 'kind', 'num_claims', 'filename',
                'withdrawn'
            ],
                       inplace=True)
            # Unparseable dates become NaT instead of raising.
            chunk['date'] = pd.to_datetime(chunk['date'], errors='coerce')
            chunk.dropna(axis='index',
                         subset=['id', 'title', 'abstract'],
                         how='any',
                         inplace=True)
            if chunk.empty:
                # Nothing survived the filter; avoid an empty bulk request.
                continue
            # The temporary file is overwritten for every chunk.
            with open(opath, 'w') as ofp:
                for _, patent in chunk.iterrows():
                    date = patent['date']
                    json.dump({'index': {'_index': index_name}}, ofp)
                    ofp.write('\n')
                    json.dump(
                        {
                            'id': patent['id'],
                            # 'date' is not part of the dropna subset, so it
                            # may be NaT here; write null rather than the
                            # string 'NaT'.
                            'date': (None if pd.isna(date) else str(
                                date.date())),
                            'title': patent['title'].lower().strip(),
                            'abstract': patent['abstract'].lower().strip()
                        }, ofp)
                    ofp.write('\n')
            bulk_insert(index_name, opath)
        refresh(index_name)
    finally:
        # Clean up even when indexing fails; the file does not exist when no
        # chunk was written.
        if os.path.exists(opath):
            os.remove(opath)
예제 #2
0
def create_es_index():
    """
    Open a connection via ``es.connect_elasticsearch`` and create the
    ``matches`` index with the settings produced by ``index_settings()``.

    The connection is stored in the module-level ``es_conn``.

    Returns whatever ``es.create_index`` returns (presumably a truthy value
    when the index is available -- confirm against ``es.create_index``).
    """
    global es_conn
    es_conn = es.connect_elasticsearch()
    # Hand the outcome back to the caller instead of discarding it.
    return es.create_index(es_conn, 'matches', index_settings())
예제 #3
0
    def on_post(self, req, resp):
        """Run the index operation selected by the ``cmd`` request parameter.

        Recognised commands are ``add`` (save the ``book`` parameter to a
        file and queue it via ``add_book_task``), ``create``, ``delete``,
        ``count``, ``search`` and ``search_advanced`` (the last two read the
        ``q`` parameter). Any other command produces an empty result.

        The result is written to the response as JSON and the status is
        always set to 200.
        """
        command = req.get_param('cmd')

        if command == 'add':
            saved_path = save_file(req.get_param('book'))
            try:
                add_book_task.delay({'path': saved_path})
            except Exception as exc:
                payload = {'error': str(exc)}
                # The task was not queued, so the saved file is removed.
                delete_file(saved_path)
            else:
                payload = {'msg': 'file putted in queue'}
        elif command == 'create':
            payload = create_index()
        elif command == 'delete':
            payload = delete_index()
        elif command == 'count':
            payload = count_items()
        elif command in ('search', 'search_advanced'):
            query = req.get_param('q')
            run_query = search if command == 'search' else search_advanced
            payload = run_query(query)
        else:
            # Missing or unknown command.
            payload = {}

        resp.body = json.dumps(payload)
        resp.status = falcon.HTTP_200
예제 #4
0
파일: utils.py 프로젝트: jimjkelly/printen
def create_indices(indices=None, set_aliases=True):
    """Create one timestamped index per alias returned by ``get_indices``.

    For each alias the default index settings are combined with the
    alias-specific custom settings from ``config.settings``, the type
    mappings reported by the alias' type classes are added under
    ``'mappings'``, and the index ``<alias>-<YYYYmmdd-HHMMSS>`` is created.

    Parameters:
        indices: value forwarded to ``get_indices``; ``None`` is treated as
            an empty list.
        set_aliases: when true, ``create_aliases`` is called with the
            collected (alias, index name) pairs after all indices exist.

    Returns:
        A tuple ``(result, aliases)`` where ``result`` is a list of
        ``(type_class, index_alias, index_name)`` tuples and ``aliases`` is
        a list of ``(index_alias, index_name)`` tuples.
    """
    import copy

    result = []
    aliases = []
    indices = indices or []

    now = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    default_settings = getattr(
        config.settings,
        'ELASTICSEARCH_DEFAULT_INDEX_SETTINGS',
        {}
    )
    custom_settings = getattr(
        config.settings,
        'ELASTICSEARCH_CUSTOM_INDEX_SETTINGS',
        {}
    )

    for index_alias, type_classes in get_indices(indices).items():
        # Work on a private copy of the defaults: the merged dict gets a
        # 'mappings' key below, and recursive_dict_update may update its
        # first argument in place (not visible here -- confirm). Without the
        # copy, one index's settings could leak into the shared defaults and
        # into every index created afterwards.
        index_settings = recursive_dict_update(
            copy.deepcopy(default_settings),
            custom_settings.get(index_alias, {})
        )

        index_name = '{0}-{1}'.format(index_alias, now)

        aliases.append((index_alias, index_name))

        type_mappings = {}
        for type_class in type_classes:
            tmp = type_class.get_type_mapping()
            if tmp:
                type_mappings[type_class.get_type_name()] = tmp

            result.append((
                type_class,
                index_alias,
                index_name
            ))

        # if we got any type mappings, put them in the index settings
        if type_mappings:
            index_settings['mappings'] = type_mappings

        es.create_index(index_name, index_settings)

    if set_aliases:
        create_aliases(aliases)

    return result, aliases
예제 #5
0
def index_claim(ipath):
    """Bulk-index the rows of claim.tsv into the ``claim_tmp`` index.

    The file is read in chunks of 50000 rows. For every chunk the columns
    that are not indexed are dropped, rows lacking a ``uuid``, ``text`` or
    ``patent_id`` are discarded, and the remaining rows are written as
    newline-delimited action/document JSON pairs to a temporary file next
    to ``ipath``. That file is handed to ``bulk_insert`` and removed once
    indexing finishes or fails.

    Parameters
    ----------
    ipath : str
        Path to claim.tsv.

    Returns
    -------
    None
        The function is used for its side effects only.

    """

    print('Indexing claim.')
    index_name = 'claim_tmp'
    opath = os.path.join(os.path.dirname(ipath), 'claim.index.tmp.json')
    create_index(index_name)
    # quoting=3 is csv.QUOTE_NONE: quote characters are kept as plain text.
    chunks = pd.read_csv(ipath,
                         sep='\t',
                         quoting=3,
                         lineterminator='\n',
                         dtype=str,
                         chunksize=50000)
    try:
        for chunk in chunks:
            chunk.drop(columns=['dependent', 'sequence', 'exemplary'],
                       inplace=True)
            chunk.dropna(axis='index',
                         subset=['uuid', 'text', 'patent_id'],
                         how='any',
                         inplace=True)
            if chunk.empty:
                # Nothing survived the filter; avoid an empty bulk request.
                continue
            # The temporary file is overwritten for every chunk.
            with open(opath, 'w') as ofp:
                for _, claim in chunk.iterrows():
                    json.dump({'index': {'_index': index_name}}, ofp)
                    ofp.write('\n')
                    json.dump(
                        {
                            # The document id is the owning patent's id.
                            'id': claim['patent_id'],
                            'text': claim['text'].lower().strip()
                        }, ofp)
                    ofp.write('\n')
            bulk_insert(index_name, opath)
        refresh(index_name)
    finally:
        # Clean up even when indexing fails; the file does not exist when no
        # chunk was written.
        if os.path.exists(opath):
            os.remove(opath)
예제 #6
0
import es
import json
import glob
import time

# Rebuild the "messages" index: delete_index followed by create_index.
es.delete_index("messages")
es.create_index("messages")

# Index every JSON file found (recursively) under data/.
nb = 0  # remains 0 when no file matches
for nb, filename in enumerate(glob.iglob('data/**/*.json', recursive=True),
                              start=1):
    # Pause for one second at every 500th file.
    if not nb % 500:
        time.sleep(1)
    with open(filename, encoding="utf8") as f:
        item = json.load(f)
        es.index("messages", "message", item)