import glob
import os
from codecs import open as copen
from os.path import join

# raw_articles_path, articles_cache_path, article_sentence_limit, get_article,
# get_raw_article, ArticleNotFoundError, Pickler and lt (the NLP-tools wrapper)
# are defined elsewhere in the project.


def prepare_articles(names):
    '''Saves tagged articles about the given entities in a cache.'''
    # Clear any raw articles left over from a previous run.
    for f in glob.glob(join(raw_articles_path, "*.txt*")):
        os.remove(f)
    found = False
    link_dictionaries = {}
    for i, name in enumerate(names):
        try:
            # Skip entities whose tagged article is already cached.
            get_article(name)
        except ArticleNotFoundError:
            try:
                article, link_dictionary = get_raw_article(name)
                link_dictionaries[i] = link_dictionary
            except ArticleNotFoundError:
                continue
            found = True
            # Keep only the first article_sentence_limit sentences (one per line).
            article = '\n'.join(article.split('\n')[:article_sentence_limit])
            with copen(join(raw_articles_path, '%d.txt' % i), 'w', 'utf-8') as out:
                out.write(article + '\n')
    if found:
        # Tag the downloaded raw articles, then remove the temporary files.
        articles = lt.run_nlptools(link_dictionaries)
        for f in glob.glob(join(raw_articles_path, "*.txt*")):
            os.remove(f)
        # Save the processed articles in the cache.
        for i, article in articles.items():
            Pickler.store(article, articles_cache_path % names[i])
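
The truncation line above keeps only the first article_sentence_limit newline-separated sentences. A minimal, self-contained sketch of that slicing pattern; the limit and the sample text are made up for illustration:

def truncate_sentences(article, limit):
    # Keep only the first `limit` newline-separated sentences.
    return '\n'.join(article.split('\n')[:limit])

sample = "First sentence.\nSecond sentence.\nThird sentence."
print(truncate_sentences(sample, 2))   # prints the first two sentences
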
# Constructor of the project's classifier (shown here without its class);
# split_camelcase, unquote, save_to_cache, models_cache_path and Pickler are
# defined elsewhere in the project, and self.classifier is built in train().
def __init__(self, predicate, sentence_limit=None, confidence_level=0.7):
    self.predicate = predicate
    self.predicate_words = [w.lower()
                            for w in split_camelcase(unquote(predicate))]
    self.confidence_level = confidence_level
    self.sentence_limit = sentence_limit
    self.train()
    if save_to_cache:
        # pickle can't serialize the analyzer function (a lambda), so it is
        # removed before saving the model and restored afterwards
        self.classifier.set_params(v__analyzer=None)
        Pickler.store(self, models_cache_path % ("svmmodel-%s.pkl" % predicate))
        self.classifier.set_params(v__analyzer=lambda x: x)
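
The second line of the constructor lowercases the camel-case predicate name into separate words through the project's split_camelcase and unquote helpers. A sketch of what that line produces, with a hypothetical splitter and an example predicate (both are illustrative assumptions, not the project's actual code):

import re
try:
    from urllib.parse import unquote   # Python 3
except ImportError:
    from urllib import unquote         # Python 2

def split_camelcase(name):
    # Hypothetical splitter: "birthPlace" -> ["birth", "Place"]
    return re.findall(r'[A-Z]?[a-z]+|[A-Z]+(?![a-z])|\d+', name)

predicate = 'birthPlace'               # made-up example predicate
predicate_words = [w.lower() for w in split_camelcase(unquote(predicate))]
print(predicate_words)                 # ['birth', 'place']
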
from collections import defaultdict

# entities_path, entities_types, full_type_name, select_entities_of_type and
# Pickler are defined elsewhere in the project.


def collect_entities():
    # Return the cached entity index if it has already been built.
    try:
        return Pickler.load(entities_path)
    except IOError:
        pass
    # Map each entity name to the indices of the types it belongs to.
    entities = defaultdict(list)
    for i, type_name in enumerate(entities_types):
        entities_of_type = select_entities_of_type(full_type_name(type_name))
        for entity in entities_of_type:
            entities[entity].append(i)
            # Also index the name without its disambiguation suffix,
            # e.g. "Washington_(state)" -> "Washington".
            if "_(" in entity:
                entities[entity.split("_(")[0]].append(i)
    Pickler.store(entities, entities_path)
    return entities
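
collect_entities follows a load-from-cache-or-rebuild pattern around the project's Pickler helper. A self-contained sketch of that pattern using the standard pickle module instead; the cache path and the build step are placeholders:

import pickle

CACHE_PATH = 'entities.pkl'            # hypothetical cache file

def load_or_build(build):
    try:
        with open(CACHE_PATH, 'rb') as f:
            return pickle.load(f)
    except IOError:                     # no cache yet: build and store it
        result = build()
        with open(CACHE_PATH, 'wb') as f:
            pickle.dump(result, f)
        return result

entities = load_or_build(lambda: {'Washington_(state)': [0], 'Washington': [0]})
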
def get_candidates(predicate):
    # CandidatesSelector, select_entities_of_type_not_in_relation,
    # candidates_cache_path and Pickler are defined elsewhere in the project.
    try:
        # Reuse previously computed candidates if they are cached.
        return Pickler.load(candidates_cache_path % predicate)
    except IOError:
        pass
    # Pick the most specific of the types that predominate for this predicate.
    types = CandidatesSelector.get_most_specific_types(
        CandidatesSelector.get_predominant_types(predicate))
    if types:
        # Candidates: entities of that type not yet in the relation.
        candidates = select_entities_of_type_not_in_relation(
            types[0], predicate)
        # Predicate-specific filters that drop candidates by name substring.
        if predicate == 'gmina':
            candidates = [e for e in candidates if 'Gmina' not in e]
        if predicate == 'powiat':
            candidates = [e for e in candidates if 'Powiat' not in e]
        if predicate == 'hrabstwo':
            candidates = [e for e in candidates
                          if 'hrabstwo_miejskie' not in e and 'Hrabstwo' not in e]
        Pickler.store(candidates, candidates_cache_path % predicate)
        return candidates
    else:
        return []
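
For a few predicates ('gmina', 'powiat', 'hrabstwo') the candidate list is additionally filtered by name substring. A compact, self-contained sketch of that filtering step, with made-up candidate names and exclusion substrings:

def filter_candidates(candidates, excluded_substrings):
    # Drop any candidate whose name contains one of the excluded substrings.
    return [c for c in candidates
            if not any(s in c for s in excluded_substrings)]

names = ['Gmina_Warszawa', 'Warszawa', 'Krakow']   # made-up candidates
print(filter_candidates(names, ['Gmina']))          # ['Warszawa', 'Krakow']
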