def index_documents(options, args):
    """Parse each item in *args* and print its merged metadata and text.

    For every item the MIME type is obtained from
    ``TikaParser.get_mime_type`` and the text/metadata pair from
    ``TikaParser.parse``.  When the returned text mapping has no keys, a
    fallback parser is picked from the metadata's ``content_type``:
    ``TextParser`` for ``text/plain`` and ``OpenDocumentParser`` for
    ``vnd.oasis.opendocument``.  Metadata from ``FsMetaParser`` is merged
    in last, and ``content_type`` is then overwritten with the MIME type
    looked up at the start.

    Args:
        options: Not used by the active code; only the disabled
            ``es_index`` call below refers to it.
        args: Iterable of items handed to the parsers (presumably file
            paths -- confirm against the caller).

    Raises:
        SystemExit: Raised after the first item has been printed.
    """
    # Function-local import so the block stays self-contained.
    import sys

    for arg in args:
        mime_type = TikaParser.get_mime_type(arg)
        text, meta = TikaParser.parse(arg)

        if not text.keys():
            # Tika produced no text: fall back to a format-specific parser.
            # .get() avoids a KeyError when no content type was reported,
            # matching the lookup style of the disabled indexing code below.
            content_type = meta.get('content_type', '')
            if 'text/plain' in content_type:
                text, txt_meta = TextParser.parse(arg)
                meta.update(txt_meta)
            elif 'vnd.oasis.opendocument' in content_type:
                text, od_meta = OpenDocumentParser.parse(arg)
                meta.update(od_meta)

        # Only the metadata half of the filesystem parser's result is used.
        _, fs_meta = FsMetaParser.parse(arg)
        meta.update(fs_meta)
        meta['content_type'] = mime_type

        for field, value in meta.items():
            print("{}: {}".format(field, value))

        # TODO: indexing is currently disabled; only PDFs were being sent.
        # if meta.get('content_type', '') == 'application/pdf':
        #     es_index(text, meta, doctype='pdf', options=options)
        print(text)

        # NOTE(review): the process stops after the first item, so any
        # remaining args are never handled.  This looks like debugging
        # scaffolding -- confirm before removing.  sys.exit() is used
        # instead of the site-provided exit() helper, which is intended
        # for the interactive shell and is not always available.
        sys.exit()
def get_terms(self, query_text):
    """Return the term-store ids for the terms parsed from *query_text*.

    ``TextParser.parse`` is applied to the query text, and each resulting
    term is looked up with ``self.term_store.get_id_for_term``.  The ids
    are returned as a list in the order the parser yielded the terms.
    """
    return [
        self.term_store.get_id_for_term(parsed_term)
        for parsed_term in TextParser.parse(query_text)
    ]