예제 #1
0
def index_worker(index, ipath):
    """Bulk-index the documents of a raw JSON file, one chunk at a time.

    Every chunk produced by ``split_file(ipath, 10000)`` is normalised and
    written to a temporary bulk file (an action line followed by the document,
    one JSON object per line), which is then passed to ``bulk_insert``.
    Documents without a ``title`` or an ``abstract`` are skipped and counted
    in the module-level ``drop_count``.  Documents are modified in place.

    Parameters
    ----------
    index : str
        Name of index.
    ipath : str
        Path to raw json files.

    Returns
    -------
    None

    """
    global drop_count

    # NOTE(review): the temporary file lives in the current working directory
    # under a fixed name; concurrent callers sharing a cwd would overwrite
    # each other's file -- confirm this function is never run in parallel.
    opath = 'tmp.json'
    try:
        for chunk in split_file(ipath, 10000):
            # The file is reopened in 'w' mode so each chunk replaces the
            # previous one instead of being appended to it.
            with open(opath, 'w') as ofp:
                for doc in chunk:
                    if 'title' not in doc or 'abstract' not in doc:
                        drop_count += 1
                        continue
                    doc['title'] = doc['title'].lower().strip()
                    doc['abstract'] = doc['abstract'].lower()
                    # Plain lists: a trailing comma here would wrap the list
                    # in a one-element tuple and serialise it as [[...]].
                    doc['keywords'] = [
                        e.lower().strip() for e in doc.get('keywords', [])
                    ]
                    doc['fos'] = [
                        e.lower().strip() for e in doc.get('fos', [])
                    ]
                    json.dump({'index': {'_index': index}}, ofp)
                    ofp.write('\n')
                    json.dump(doc, ofp)
                    ofp.write('\n')
            bulk_insert(index, opath)
        # Refresh the index that was actually written to, not a global one.
        refresh(index)
    finally:
        # Clean up even on failure; the file does not exist when
        # ``split_file`` yielded no chunk at all.
        if os.path.exists(opath):
            os.remove(opath)
예제 #2
0
def index_patent(ipath):
    """Index the patents listed in patent.tsv.

    The file is read in chunks of 50000 rows.  Rows missing an ``id``,
    ``title`` or ``abstract`` are dropped; the remaining rows are written to
    a temporary bulk file next to ``ipath`` (an action line followed by the
    document, one JSON object per line) and passed to ``bulk_insert`` under
    the index name ``patent_tmp``.

    Parameters
    ----------
    ipath : str
        Path to patent.tsv.

    Returns
    -------
    None

    """

    print('Indexing patent.')
    index_name = 'patent_tmp'
    opath = os.path.join(os.path.dirname(ipath), 'patent.index.tmp.json')
    create_index(index_name)
    chunks = pd.read_csv(ipath,
                         sep='\t',
                         quoting=3,
                         lineterminator='\n',
                         dtype=str,
                         chunksize=50000)
    # The action line is identical for every document, so serialise it once.
    action_line = json.dumps({'index': {'_index': index_name}}) + '\n'
    try:
        for chunk in chunks:
            # Dropping the unused columns also fails loudly (KeyError) when
            # the input does not have the expected layout.
            chunk = chunk.drop(columns=[
                'type', 'number', 'country', 'kind', 'num_claims',
                'filename', 'withdrawn'
            ])
            chunk['date'] = pd.to_datetime(chunk['date'], errors='coerce')
            chunk = chunk.dropna(axis='index',
                                 subset=['id', 'title', 'abstract'],
                                 how='any')
            # Reopened in 'w' mode so each chunk replaces the previous one.
            with open(opath, 'w') as ofp:
                rows = zip(chunk['id'], chunk['date'], chunk['title'],
                           chunk['abstract'])
                for patent_id, date, title, abstract in rows:
                    ofp.write(action_line)
                    json.dump(
                        {
                            'id': patent_id,
                            # Unparseable dates were coerced to NaT; emit a
                            # JSON null rather than the literal string 'NaT'.
                            'date': (None if pd.isna(date)
                                     else str(date.date())),
                            'title': title.lower().strip(),
                            'abstract': abstract.lower().strip()
                        }, ofp)
                    ofp.write('\n')
            bulk_insert(index_name, opath)
        refresh(index_name)
    finally:
        # Clean up even on failure; the file does not exist when the reader
        # produced no chunk at all.
        if os.path.exists(opath):
            os.remove(opath)
예제 #3
0
def index_claim(ipath):
    """Index the patent claims listed in claim.tsv.

    The file is read in chunks of 50000 rows.  Rows missing a ``uuid``,
    ``text`` or ``patent_id`` are dropped; for the remaining rows the patent
    id and the lower-cased, stripped claim text are written to a temporary
    bulk file next to ``ipath`` (an action line followed by the document, one
    JSON object per line) and passed to ``bulk_insert`` under the index name
    ``claim_tmp``.

    Parameters
    ----------
    ipath : str
        Path to claim.tsv.

    Returns
    -------
    None

    """

    print('Indexing claim.')
    index_name = 'claim_tmp'
    opath = os.path.join(os.path.dirname(ipath), 'claim.index.tmp.json')
    create_index(index_name)
    chunks = pd.read_csv(ipath,
                         sep='\t',
                         quoting=3,
                         lineterminator='\n',
                         dtype=str,
                         chunksize=50000)
    # The action line is identical for every document, so serialise it once.
    action_line = json.dumps({'index': {'_index': index_name}}) + '\n'
    try:
        for chunk in chunks:
            # Dropping the unused columns also fails loudly (KeyError) when
            # the input does not have the expected layout.
            chunk = chunk.drop(columns=['dependent', 'sequence', 'exemplary'])
            chunk = chunk.dropna(axis='index',
                                 subset=['uuid', 'text', 'patent_id'],
                                 how='any')
            # Reopened in 'w' mode so each chunk replaces the previous one.
            with open(opath, 'w') as ofp:
                for patent_id, text in zip(chunk['patent_id'],
                                           chunk['text']):
                    ofp.write(action_line)
                    json.dump(
                        {
                            'id': patent_id,
                            'text': text.lower().strip()
                        }, ofp)
                    ofp.write('\n')
            bulk_insert(index_name, opath)
        refresh(index_name)
    finally:
        # Clean up even on failure; the file does not exist when the reader
        # produced no chunk at all.
        if os.path.exists(opath):
            os.remove(opath)