def prepare_articles(names):
    '''Saves tagged articles about the given entities in a cache.

    For every name that has no processed article yet, fetches the raw
    article, trims it to ``article_sentence_limit`` sentences (one per
    line), and dumps it to ``raw_articles_path/<index>.txt``.  All dumped
    articles are then run through the NLP tools in one batch and the
    results are pickled per entity name.
    '''
    # Clear any leftover raw-article dumps from a previous run.
    for f in glob.glob(join(raw_articles_path, "*.txt*")):
        os.remove(f)
    found = False
    link_dictionaries = {}
    for i, name in enumerate(names):
        try:
            # Already processed and cached - nothing to do for this name.
            get_article(name)
        except ArticleNotFoundError:
            try:
                article, link_dictionary = get_raw_article(name)
                link_dictionaries[i] = link_dictionary
            except ArticleNotFoundError:
                # No article available at all for this entity - skip it.
                continue
            found = True
            # Articles are stored one sentence per line; keep only the
            # first article_sentence_limit sentences.
            article = '\n'.join(article.split('\n')[:article_sentence_limit])
            out = copen(join(raw_articles_path, '%d.txt' % i), 'w', 'utf-8')
            try:
                print >>out, article
            finally:
                # Close explicitly - the original leaked the handle until GC.
                out.close()
    if found:
        articles = lt.run_nlptools(link_dictionaries)
        # Raw dumps are no longer needed once the NLP tools have run.
        for f in glob.glob(join(raw_articles_path, "*.txt*")):
            os.remove(f)
        # Save processed articles to the per-entity cache.
        for i, article in articles.iteritems():
            Pickler.store(article, articles_cache_path % names[i])
def prepare_articles(names):
    '''Saves tagged articles about the given entities in a cache.

    For every name lacking a processed article, fetches the raw article,
    trims it to ``article_sentence_limit`` sentences (one per line) and
    writes it to ``raw_articles_path/<index>.txt``; then runs the NLP
    tools over the batch and pickles each result per entity name.
    '''
    # Remove stale raw-article dumps from a previous run.
    for f in glob.glob(join(raw_articles_path, "*.txt*")):
        os.remove(f)
    found = False
    link_dictionaries = {}
    for i, name in enumerate(names):
        try:
            # Cache hit - this entity's article is already processed.
            get_article(name)
        except ArticleNotFoundError:
            try:
                article, link_dictionary = get_raw_article(name)
                link_dictionaries[i] = link_dictionary
            except ArticleNotFoundError:
                # No article exists for this entity at all.
                continue
            found = True
            # One sentence per line; truncate to the configured limit.
            article = '\n'.join(article.split('\n')[:article_sentence_limit])
            out = copen(join(raw_articles_path, '%d.txt' % i), 'w', 'utf-8')
            try:
                print >> out, article
            finally:
                # Close explicitly - the original leaked the handle until GC.
                out.close()
    if found:
        articles = lt.run_nlptools(link_dictionaries)
        # Raw dumps are no longer needed after NLP processing.
        for f in glob.glob(join(raw_articles_path, "*.txt*")):
            os.remove(f)
        # save processed articles
        for i, article in articles.iteritems():
            Pickler.store(article, articles_cache_path % names[i])
def __init__(self, predicate, sentence_limit=None, confidence_level=0.7):
    '''Trains a classifier model for the given predicate.

    :param predicate: predicate name (possibly URL-quoted camel case);
        it is split into lowercase words for later feature matching.
    :param sentence_limit: optional cap on sentences used in training.
    :param confidence_level: minimum confidence threshold (default 0.7).
    '''
    self.predicate = predicate
    self.predicate_words = map(lambda w: w.lower(), split_camelcase(unquote(predicate)))
    self.confidence_level = confidence_level
    self.sentence_limit = sentence_limit
    self.train()
    if save_to_cache:
        # pickle can't save a function, so it's removed before saving
        self.classifier.set_params(v__analyzer=None)
        Pickler.store(self, models_cache_path % ("svmmodel-%s.pkl" % predicate))
        # Restore the analyzer so the in-memory model keeps working.
        self.classifier.set_params(v__analyzer=lambda x: x)
def __init__(self, predicate, sentence_limit=None, confidence_level=.7):
    '''Trains a classifier model for the given predicate.

    :param predicate: predicate name (possibly URL-quoted camel case);
        split into lowercase words for later feature matching.
    :param sentence_limit: optional cap on sentences used in training.
    :param confidence_level: minimum confidence threshold (default 0.7).
    '''
    self.predicate = predicate
    self.predicate_words = map(lambda w: w.lower(), split_camelcase(unquote(predicate)))
    self.confidence_level = confidence_level
    self.sentence_limit = sentence_limit
    self.train()
    if save_to_cache:
        # pickle can't save a function, so it's removed before saving
        self.classifier.set_params(v__analyzer=None)
        Pickler.store(self, models_cache_path % ('svmmodel-%s.pkl' % predicate))
        # Restore the analyzer so the in-memory model keeps working.
        self.classifier.set_params(v__analyzer=lambda x: x)
def collect_entities():
    '''Returns a mapping from entity name to the list of its type indices.

    The result is cached on disk at ``entities_path``; on a cache miss it
    is rebuilt by querying entities for every type in ``entities_types``.
    For names carrying a disambiguation suffix, e.g. ``Foo_(bar)``, the
    bare name ``Foo`` is registered under the same type index as well.
    '''
    try:
        return Pickler.load(entities_path)
    except IOError:
        # Cache miss - rebuild the mapping below.
        pass
    entities = defaultdict(list)
    for i, type in enumerate(entities_types):
        entities_of_type = select_entities_of_type(full_type_name(type))
        for entity in entities_of_type:
            entities[entity].append(i)
            if "_(" in entity:
                # Also index the name without its disambiguation suffix.
                entities[entity.split("_(")[0]].append(i)
    Pickler.store(entities, entities_path)
    return entities
def collect_entities():
    '''Returns a mapping from entity name to the list of its type indices.

    Cached on disk at ``entities_path``; on a cache miss the mapping is
    rebuilt from ``entities_types``.  Names with a disambiguation suffix
    such as ``Foo_(bar)`` are additionally indexed under the bare ``Foo``.
    '''
    try:
        return Pickler.load(entities_path)
    except IOError:
        # No cached mapping yet - rebuild it below.
        pass
    entities = defaultdict(list)
    for i, type in enumerate(entities_types):
        entities_of_type = select_entities_of_type(full_type_name(type))
        for entity in entities_of_type:
            entities[entity].append(i)
            if '_(' in entity:
                # Also index the name without its disambiguation suffix.
                entities[entity.split('_(')[0]].append(i)
    Pickler.store(entities, entities_path)
    return entities
def get_candidates(predicate):
    '''Returns candidate entities that could hold the given predicate.

    Candidates are entities of the predicate's most specific predominant
    type that are not yet in the relation.  The list is cached on disk at
    ``candidates_cache_path % predicate``.  Returns ``[]`` when no
    predominant type can be determined.
    '''
    try:
        return Pickler.load(candidates_cache_path % predicate)
    except IOError:
        # Cache miss - compute the candidate list below.
        pass
    types = CandidatesSelector.get_most_specific_types(
        CandidatesSelector.get_predominant_types(predicate))
    if types:
        candidates = select_entities_of_type_not_in_relation(
            types[0], predicate)
        # Predicate-specific blacklists (Polish administrative units):
        # drop entities whose names would make degenerate candidates.
        if predicate == 'gmina':
            candidates = filter(lambda e: 'Gmina' not in e, candidates)
        if predicate == 'powiat':
            candidates = filter(lambda e: 'Powiat' not in e, candidates)
        if predicate == 'hrabstwo':
            candidates = filter(
                lambda e: 'hrabstwo_miejskie' not in e and 'Hrabstwo' not in e,
                candidates)
        Pickler.store(candidates, candidates_cache_path % predicate)
        return candidates
    else:
        return []
def get_candidates(predicate):
    '''Returns candidate entities that could hold the given predicate.

    Candidates are entities of the predicate's most specific predominant
    type not yet in the relation; the list is cached on disk at
    ``candidates_cache_path % predicate``.  Returns ``[]`` when no
    predominant type is found.
    '''
    try:
        return Pickler.load(candidates_cache_path % predicate)
    except IOError:
        # Not cached yet - compute below.
        pass
    types = CandidatesSelector.get_most_specific_types(
        CandidatesSelector.get_predominant_types(predicate)
    )
    if types:
        candidates = select_entities_of_type_not_in_relation(
            types[0], predicate
        )
        # Predicate-specific blacklists (Polish administrative units):
        # drop entities whose names would make degenerate candidates.
        if predicate == 'gmina':
            candidates = filter(lambda e: 'Gmina' not in e, candidates)
        if predicate == 'powiat':
            candidates = filter(lambda e: 'Powiat' not in e, candidates)
        if predicate == 'hrabstwo':
            candidates = filter(lambda e: 'hrabstwo_miejskie' not in e and 'Hrabstwo' not in e, candidates)
        Pickler.store(candidates, candidates_cache_path % predicate)
        return candidates
    else:
        return []