# Imports assumed by this excerpt (all names below are used in the code):
import codecs
import json
import os
import sys
from hashlib import md5

import simplejson


def extract_nlp(self, text):
    """Extract NLP information (parse trees and per-sentence words) from
    `text`, caching the result on disk keyed by an MD5 digest of the input."""
    digest_data = 'nlptags.cache_' + \
        md5(text.encode('ascii', 'ignore')).hexdigest()
    if not os.path.exists(self.cachedir):
        print >> sys.stderr, ('[cache error] directory %s does not exist'
                              % self.cachedir)
        try:
            print '[cache info] Creating caches directory'
            os.makedirs(self.cachedir)
        except OSError:
            print >> sys.stderr, '[cache error] Failed to create caches directory'
            sys.exit(1)
    cache_path = os.path.join(self.cachedir, digest_data)
    if os.path.exists(cache_path):
        # cache hit: return the previously stored result
        with codecs.open(cache_path, mode='rb', encoding='utf-8') as f:
            return json.load(f)
    else:
        nlptags = []
        words = []
        parsed = simplejson.loads(self.server.parse(text))
        for st in parsed['sentences']:
            nlptags.append(self.parse_parsetree(st['parsetree']))
            words.append(st['words'])
        out = prep_for_json({'parsetree': nlptags, 'words': words})
        # cache miss: store the result for future calls
        with codecs.open(cache_path, mode='wb', encoding='utf-8') as f:
            json.dump(out, f)
        return out
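# Usage sketch for extract_nlp (hedged): this is a method, so `tagger`
# below stands for an instance of the owning class, which must provide
# `cachedir`, a `server` object whose parse() returns CoreNLP-style JSON,
# and the `parse_parsetree` helper. The input sentence is made up.
#
#     tagger = ...  # instance of the class defining extract_nlp
#     info = tagger.extract_nlp('The patient was given aspirin.')
#     info['parsetree']  # one parse tree per input sentence
#     info['words']      # per-sentence token annotations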
def extract_nlp_batch(self, input_list):
    """
    Extract NLP information from `input_list`.
    Returns a <dict> {`sentences<list>`: nlp info,
                      `corefs<list>`: coreference info}
    `sentences` is a list of nlp info corresponding to entries
    in `input_list`.
    See method *parse* for more info.
    """
    digest_data = 'nlptags_batch.cache_' + \
        md5(str(input_list).encode('ascii', 'ignore')).hexdigest()
    if not os.path.exists(self.cachedir):
        print >> sys.stderr, ('[cache error] directory %s does not exist'
                              % self.cachedir)
        try:
            print '[cache info] Creating caches directory'
            os.makedirs(self.cachedir)
        except OSError:
            print >> sys.stderr, '[cache error] Failed to create caches directory'
            sys.exit(1)
    cache_path = os.path.join(self.cachedir, digest_data)
    if os.path.exists(cache_path):
        # cache hit: return the previously stored result
        with codecs.open(cache_path, mode='rb', encoding='utf-8') as f:
            return json.load(f)
    else:
        nlptags = []
        corefs = []
        for i in input_list:
            parsed = self.parse(i)
            nlptags.append(parsed['sentences'])
            corefs.append(parsed['coref'])
        res = prep_for_json({'sentences': nlptags, 'corefs': corefs})
        with codecs.open(cache_path, mode='wb', encoding='utf-8') as f:
            # bug fix: the original dumped the undefined name `out` here
            json.dump(res, f)
        return res
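# Usage sketch for extract_nlp_batch (hedged): like extract_nlp, this is
# a method, so `tagger` below stands for an instance of the owning class;
# the input strings are made up.
#
#     docs = ['He was admitted on Monday.', 'She denies any fever.']
#     res = tagger.extract_nlp_batch(docs)
#     res['sentences'][0]  # nlp info for docs[0]
#     res['corefs'][0]     # coreference chains for docs[0]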
def batch_filtered(data, semtypes, fields=None, cachedir='../cache',
                   mmjar=MMJAR_PATH, no_cache=False, long_concepts=True):
    """
    Exactly like `batch`, with the following difference:
    it returns only concepts and their semantic types, and only for
    semantic types that are listed in `semtypes`.
    The output format is a list of concepts with only the concept name
    and its semantic type.
    The default semtypes path is: data/semantic_types.json
    """
    if not no_cache:
        digest_data = ('batch_mm_filtered_{0}.cache'
                       ''.format(md5(json.dumps([fields, data])).hexdigest()))
        if not os.path.exists(cachedir):
            print >> sys.stderr, '[cache error] %s does not exist' % cachedir
            sys.exit(1)
        cache_path = os.path.join(cachedir, digest_data)
        if os.path.exists(cache_path):
            with codecs.open(cache_path, mode='rb', encoding='utf-8') as f:
                return json.load(f)
    out = []
    for elem in data:
        # if fields is not specified, then elem_fields == elem.keys();
        # otherwise the list comprehension acts like a filter function
        elem_fields = [k for k in elem.keys()
                       if (not fields or (k in fields))]
        result = {fl: run(elem[fl], no_cache=True, mmjar=mmjar,
                          long_concepts=long_concepts)
                  for fl in elem_fields}
        found_concepts = {}
        for fl in elem_fields:
            found_concepts[fl] = []
            for concept in result[fl]['txt']['concepts']:
                if str(concept['semtype'][0]) in semtypes:
                    found_concept = {}
                    found_concept['cname'] = str(concept['cname'])
                    # note: if a concept has several semantic types, only
                    # the last one survives this loop
                    for t in concept['semtype']:
                        found_concept['ctype'] = t
                    found_concepts[fl].append(found_concept)
        out.append(found_concepts)
    if not no_cache:
        out = prep_for_json(out)
        with codecs.open(cache_path, mode='wb', encoding='utf-8') as f:
            json.dump(out, f)
    return out
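# Usage sketch for batch_filtered (hedged): `semtypes` is a collection of
# MetaMap semantic-type abbreviations. The two shown here ('dsyn' for
# Disease or Syndrome, 'sosy' for Sign or Symptom) are standard MetaMap
# types, but whether they appear in data/semantic_types.json is an
# assumption, and `records` is made-up data.
#
#     semtypes = ['dsyn', 'sosy']
#     concepts = batch_filtered(records, semtypes, fields=('txt',))
#     # concepts[0]['txt'] -> [{'cname': ..., 'ctype': ...}, ...]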
def batch(data, fields=None, cachedir='../cache', mmjar=MMJAR_PATH,
          no_cache=False, long_concepts=True):
    """
    Batch process all the elements in `data` and cache them in a single
    file (reduces IO time).
    `data` is a list of dictionaries. If fields=None, then all the
    fields in every dictionary are cached; otherwise `fields` should be
    a list/tuple containing the relevant fields to consider.
    """
    if not no_cache:
        digest_data = ('batch_mm_{0}.cache'
                       ''.format(md5(json.dumps([fields, data])).hexdigest()))
        if not os.path.exists(cachedir):
            print >> sys.stderr, '[cache error] %s does not exist' % cachedir
            sys.exit(1)
        cache_path = os.path.join(cachedir, digest_data)
        if os.path.exists(cache_path):
            with codecs.open(cache_path, mode='rb', encoding='utf-8') as f:
                return json.load(f)
    out = []
    for elem in data:
        # if fields is not specified, then elem_fields == elem.keys();
        # otherwise the list comprehension acts like a filter function
        elem_fields = [k for k in elem.keys()
                       if (not fields or (k in fields))]
        result = {fl: run(elem[fl], no_cache=True, mmjar=mmjar,
                          long_concepts=long_concepts)
                  for fl in elem_fields}
        out.append(result)
    if not no_cache:
        out = prep_for_json(out)
        with codecs.open(cache_path, mode='wb', encoding='utf-8') as f:
            json.dump(out, f)
    return out
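# Usage sketch for batch (hedged): `data` mirrors the structure the code
# expects (a list of dicts of text fields); the field names and contents
# below are made up for illustration.
#
#     records = [{'title': 'chest pain on exertion',
#                 'txt': 'The patient reports chest pain ...'},
#                {'title': 'follow-up visit',
#                 'txt': 'No acute distress was observed ...'}]
#     # process only the 'txt' field of every record
#     results = batch(records, fields=('txt',), cachedir='../cache')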