import json
import os

import pandas as pd

drop_count = 0  # documents skipped for missing a title or abstract


def index_worker(index, ipath):
    """Bulk-index raw JSON documents.

    Parameters
    ----------
    index : str
        Name of index.
    ipath : str
        Path to the raw JSON input file.
    """
    global drop_count
    opath = 'tmp.json'
    for chunk in split_file(ipath, 10000):
        with open(opath, 'w') as ofp:
            for doc in chunk:
                # Skip documents missing the required fields.
                if 'title' not in doc or 'abstract' not in doc:
                    drop_count += 1
                    continue
                doc['title'] = doc['title'].lower().strip()
                doc['abstract'] = doc['abstract'].lower()
                doc['keywords'] = [e.lower().strip()
                                   for e in doc.get('keywords', [])]
                doc['fos'] = [e.lower().strip()
                              for e in doc.get('fos', [])]
                # Bulk format: one action line, then the document itself.
                json.dump({'index': {'_index': index}}, ofp)
                ofp.write('\n')
                json.dump(doc, ofp)
                ofp.write('\n')
        bulk_insert(index, opath)
    refresh(index)
    os.remove(opath)
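# The helpers below are not defined in this module. This is a minimal sketch
# of what index_worker appears to assume, targeting Elasticsearch's _bulk and
# _refresh REST endpoints over plain HTTP. ES_HOST and these signatures are
# assumptions for illustration, not confirmed parts of the original codebase.
import requests

ES_HOST = 'http://localhost:9200'  # assumed Elasticsearch endpoint


def split_file(ipath, chunk_size):
    """Yield lists of parsed JSON documents, chunk_size lines at a time."""
    chunk = []
    with open(ipath) as ifp:
        for line in ifp:
            chunk.append(json.loads(line))
            if len(chunk) == chunk_size:
                yield chunk
                chunk = []
    if chunk:
        yield chunk


def bulk_insert(index, path):
    """POST an NDJSON action/document file to the _bulk endpoint."""
    with open(path, 'rb') as fp:
        resp = requests.post(
            f'{ES_HOST}/_bulk', data=fp,
            headers={'Content-Type': 'application/x-ndjson'})
    resp.raise_for_status()


def refresh(index):
    """Make newly indexed documents visible to search."""
    requests.post(f'{ES_HOST}/{index}/_refresh').raise_for_status()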
def index_patent(ipath):
    """Read patent.tsv and index granted patents.

    Parameters
    ----------
    ipath : str
        Path to patent.tsv.
    """
    print('Indexing patent.')
    index_name = 'patent_tmp'
    opath = os.path.join(os.path.dirname(ipath), 'patent.index.tmp.json')
    create_index(index_name)
    chunks = pd.read_csv(ipath, sep='\t', quoting=3,  # csv.QUOTE_NONE
                         lineterminator='\n', dtype=str, chunksize=50000)
    for chunk in chunks:
        # Keep only the columns needed for the index.
        chunk.drop(columns=['type', 'number', 'country', 'kind',
                            'num_claims', 'filename', 'withdrawn'],
                   inplace=True)
        chunk['date'] = pd.to_datetime(chunk['date'], errors='coerce')
        # Also drop rows whose date failed to parse (NaT).
        chunk.dropna(axis='index', subset=['id', 'date', 'title', 'abstract'],
                     how='any', inplace=True)
        with open(opath, 'w') as ofp:
            for _, patent in chunk.iterrows():
                json.dump({'index': {'_index': index_name}}, ofp)
                ofp.write('\n')
                json.dump(
                    {
                        'id': patent['id'],
                        'date': str(patent['date'].date()),
                        'title': patent['title'].lower().strip(),
                        'abstract': patent['abstract'].lower().strip()
                    }, ofp)
                ofp.write('\n')
        bulk_insert(index_name, opath)
    refresh(index_name)
    os.remove(opath)
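# index_patent and index_claim call a create_index helper that is also not
# defined here. A minimal sketch, again assuming a plain HTTP Elasticsearch
# backend; the explicit field mapping is illustrative, not taken from the
# original project (Elasticsearch would otherwise map fields dynamically).
def create_index(index):
    """Create the index with typed fields, tolerating an existing index."""
    mapping = {
        'mappings': {
            'properties': {
                'id': {'type': 'keyword'},
                'date': {'type': 'date'},
                'title': {'type': 'text'},
                'abstract': {'type': 'text'},
                'text': {'type': 'text'}
            }
        }
    }
    resp = requests.put(f'{ES_HOST}/{index}', json=mapping)
    if resp.status_code != 200 \
            and 'resource_already_exists_exception' not in resp.text:
        resp.raise_for_status()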
def index_claim(ipath):
    """Read claim.tsv and index patent claims.

    Parameters
    ----------
    ipath : str
        Path to claim.tsv.
    """
    print('Indexing claim.')
    index_name = 'claim_tmp'
    opath = os.path.join(os.path.dirname(ipath), 'claim.index.tmp.json')
    create_index(index_name)
    chunks = pd.read_csv(ipath, sep='\t', quoting=3,  # csv.QUOTE_NONE
                         lineterminator='\n', dtype=str, chunksize=50000)
    for chunk in chunks:
        chunk.drop(columns=['dependent', 'sequence', 'exemplary'],
                   inplace=True)
        chunk.dropna(axis='index', subset=['uuid', 'text', 'patent_id'],
                     how='any', inplace=True)
        with open(opath, 'w') as ofp:
            for _, claim in chunk.iterrows():
                json.dump({'index': {'_index': index_name}}, ofp)
                ofp.write('\n')
                json.dump(
                    {
                        'id': claim['patent_id'],
                        'text': claim['text'].lower().strip()
                    }, ofp)
                ofp.write('\n')
        bulk_insert(index_name, opath)
    refresh(index_name)
    os.remove(opath)
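# A hypothetical driver showing how these functions might be wired together.
# The flag names and the default index name are assumptions for illustration,
# not the original project's CLI.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Bulk-index patent data.')
    parser.add_argument('--index', default='paper_tmp',
                        help='target index for index_worker')
    parser.add_argument('--papers', help='path to raw JSON papers')
    parser.add_argument('--patent', help='path to patent.tsv')
    parser.add_argument('--claim', help='path to claim.tsv')
    args = parser.parse_args()

    if args.papers:
        create_index(args.index)
        index_worker(args.index, args.papers)
        print(f'Dropped {drop_count} documents without title/abstract.')
    if args.patent:
        index_patent(args.patent)
    if args.claim:
        index_claim(args.claim)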