Example #1
        def _train(train_raw):
            models = {}
            all_docs = {}
            # Accumulate features and labels across every topic.
            x_train = []
            y_train = []
            for topic in train_raw:
                for inst in train_raw[topic]:

                    feature_vector = [
                        ext.extract(inst[0], inst[1])
                        for ext in self.extractors
                    ]
                    x_train.append(feature_vector)
                    y_train.append(inst[2])
            svm = Supervised(self.args, self.opts)
            if not os.path.exists(constants.get_path()['tmp'] +
                                  '/ltr-features-all'):
                with open(constants.get_path()['tmp'] + '/ltr-features-all',
                          'wb') as mf:
                    json.dump({
                        'x_train': x_train,
                        'y_train': y_train
                    },
                              mf,
                              indent=2)
            svm.train(x_train, y_train)
            all_docs = [
                inst[1] for topc in train_raw for inst in train_raw[topc]
            ]
            return svm, all_docs
Example #2
        def _train(train_raw):
            models = {}
            all_docs = {}
            for topic in train_raw:
                x_train = []
                y_train = []
                for inst in train_raw[topic]:

                    feature_vector = [
                        ext.extract(inst[0], inst[1])
                        for ext in self.extractors
                    ]
                    x_train.append(feature_vector)
                    y_train.append(inst[2])
                svm = Supervised(self.args, self.opts)
                with open(
                        constants.get_path()['tmp'] +
                        '/ltr-features-%s' % topic, 'wb') as mf:
                    json.dump({
                        'x_train': x_train,
                        'y_train': y_train
                    },
                              mf,
                              indent=2)
                svm.train(x_train, y_train)
                models[topic.lower()] = svm
                all_docs[topic] = [inst[1] for inst in train_raw[topic]]
            # Return only after every topic has been processed.
            return models, all_docs
Example #3
    def train(self, train_set):
        '''
        The training data come from docs_path and json_data_path
            In init: self.train_raw

        '''
        @object_hashing(
            cache_comment='svm_models_%s' % hash_obj(
                train_set),
            cachedir=constants.get_path()['cache'])
        def _train(train_raw):
            models = {}
            all_docs = {}
            # Accumulate features and labels across every topic.
            x_train = []
            y_train = []
            for topic in train_raw:
                for inst in train_raw[topic]:

                    feature_vector = [
                        ext.extract(inst[0], inst[1]) for ext in self.extractors]
                    x_train.append(feature_vector)
                    y_train.append(inst[2])
            svm = Supervised(self.args, self.opts)
            if not os.path.exists(constants.get_path()['tmp'] + '/ltr-features-all'):
                with open(constants.get_path()['tmp'] + '/ltr-features-all', 'wb') as mf:
                    json.dump(
                        {'x_train': x_train, 'y_train': y_train}, mf, indent=2)
            svm.train(x_train, y_train)
            all_docs = [inst[1]
                        for topc in train_raw for inst in train_raw[topc]]
            return svm, all_docs
        self.model, self.all_docs =\
            _train(train_set)
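
The @object_hashing decorator used above comes from util.cache, whose implementation is not shown in these examples. A minimal sketch of what such a disk-memoization decorator might look like, assuming it simply caches the wrapped function's return value under cache_comment inside cachedir (the real implementation may differ):

import os
import pickle
from functools import wraps


def object_hashing(cache_comment, cachedir):
    # Memoize the decorated function's return value on disk,
    # keyed by the caller-supplied cache_comment string.
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            cache_file = os.path.join(cachedir, cache_comment + '.pickle')
            if os.path.exists(cache_file):
                # A cached result exists: load and return it instead of recomputing.
                with open(cache_file, 'rb') as f:
                    return pickle.load(f)
            result = func(*args, **kwargs)
            with open(cache_file, 'wb') as f:
                pickle.dump(result, f)
            return result
        return wrapper
    return decorator

Under this assumption, a second call to _train(train_set) with the same cache_comment loads the pickled (model, docs) pair from cachedir instead of retraining.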
Example #4
    def train(self, train_set):
        '''
        Train_set is null,
        the training data come from docs_path and json_data_path

        '''
        @object_hashing(
            cache_comment='svm_models_%s' % hash_obj(
                train_set),
            cachedir=constants.get_path()['cache'])
        def _train(train_raw):
            models = {}
            all_docs = {}
            for topic in train_raw:
                x_train = []
                y_train = []
                for inst in train_raw[topic]:

                    feature_vector = [
                        ext.extract(inst[0], inst[1]) for ext in self.extractors]
                    x_train.append(feature_vector)
                    y_train.append(inst[2])
                svm = Supervised(self.args, self.opts)
                with open(constants.get_path()['tmp'] + '/ltr-features-%s' % topic, 'wb') as mf:
                    json.dump(
                        {'x_train': x_train, 'y_train': y_train}, mf, indent=2)
                svm.train(x_train, y_train)
                models[topic.lower()] = svm
                all_docs[topic] = [inst[1] for inst in train_raw[topic]]
            # Return only after every topic has been processed.
            return models, all_docs
        self.models, self.all_docs =\
            _train(train_set)
Example #5
        def _train(train_raw):
            models = {}
            all_docs = {}
            # Accumulate features and labels across every topic.
            x_train = []
            y_train = []
            for topic in train_raw:
                for inst in train_raw[topic]:

                    feature_vector = [
                        ext.extract(inst[0], inst[1]) for ext in self.extractors]
                    x_train.append(feature_vector)
                    y_train.append(inst[2])
            svm = Supervised(self.args, self.opts)
            if not os.path.exists(constants.get_path()['tmp'] + '/ltr-features-all'):
                with open(constants.get_path()['tmp'] + '/ltr-features-all', 'wb') as mf:
                    json.dump(
                        {'x_train': x_train, 'y_train': y_train}, mf, indent=2)
            svm.train(x_train, y_train)
            all_docs = [inst[1]
                        for topc in train_raw for inst in train_raw[topc]]
            return svm, all_docs
Example #6
 def __init__(self, host="localhost", port=9200, index_name="biosum", cred_path=".cred"):
     self.host = host
     self.port = port
     self.index_name = index_name
     self.cred_path = cred_path
     # self.doc_type = 'papers'
     self.es = self.__connect()
     self.ic = IndicesClient(self.es)
     try:
         cache_file = constants.get_path()["cache"]
         self.page_cache = shelve.open(cache_file + "/pages.p", writeback=False)
     except:
         print "Not found: %s" % cache_file
         print sys.exc_info()[0]
         sys.exit()
Example #7
File: gui.py Project: AjaxVM/pyglibs
 def load_cursor_from_image(self, filename):
     i = pygame.image.load(constants.get_path(filename))
     i = pygame.transform.flip(i, True, False)
     i = pygame.transform.rotate(i, 90)
     size = i.get_size()
     cur = []
     for x in xrange(size[0]):
         n = ""
         for y in xrange(size[1]):
             val = i.get_at((x, y))[0:3]
             if val == (0, 0, 0):
                 n = n + "X"
             elif val == (255, 0, 0):
                 n = n + "."
             else:
                 n = n + " "
         cur.append(n)
     return (size, (0,0)) + pygame.cursors.compile(cur)
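
The returned tuple has the same shape as the arguments of pygame.mouse.set_cursor(size, hotspot, xormask, andmask), since pygame.cursors.compile returns the two mask tuples. A hypothetical usage, where gui_instance stands for an instance of the class this method belongs to and "cursor.png" is a placeholder filename:

# Load a compiled cursor and activate it; unpacking supplies size, hotspot,
# xormask and andmask in the order pygame.mouse.set_cursor expects.
cursor = gui_instance.load_cursor_from_image("cursor.png")
pygame.mouse.set_cursor(*cursor)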
Example #8
 def __init__(self,
              host='localhost',
              port=9200,
              index_name='biosum',
              cred_path='.cred'):
     self.host = host
     self.port = port
     self.index_name = index_name
     self.cred_path = cred_path
     # self.doc_type = 'papers'
     self.es = self.__connect()
     self.ic = IndicesClient(self.es)
     try:
         cache_file = constants.get_path()['cache']
         self.page_cache = shelve.open(cache_file + '/pages.p',
                                       writeback=False)
     except:
         print 'Not found: %s' % cache_file
         print sys.exc_info()[0]
         sys.exit()
Example #9
        def _train(train_raw):
            models = {}
            all_docs = {}
            for topic in train_raw:
                x_train = []
                y_train = []
                for inst in train_raw[topic]:

                    feature_vector = [
                        ext.extract(inst[0], inst[1]) for ext in self.extractors]
                    x_train.append(feature_vector)
                    y_train.append(inst[2])
                svm = Supervised(self.args, self.opts)
                with open(constants.get_path()['tmp'] + '/ltr-features-%s' % topic, 'wb') as mf:
                    json.dump(
                        {'x_train': x_train, 'y_train': y_train}, mf, indent=2)
                svm.train(x_train, y_train)
                models[topic.lower()] = svm
                all_docs[topic] = [inst[1] for inst in train_raw[topic]]
            # Return only after every topic has been processed.
            return models, all_docs
Example #10
    def train(self, train_set):
        '''
        The training data come from docs_path and json_data_path
            In init: self.train_raw

        '''
        @object_hashing(cache_comment='svm_models_%s' % hash_obj(train_set),
                        cachedir=constants.get_path()['cache'])
        def _train(train_raw):
            models = {}
            all_docs = {}
            # Accumulate features and labels across every topic.
            x_train = []
            y_train = []
            for topic in train_raw:
                for inst in train_raw[topic]:

                    feature_vector = [
                        ext.extract(inst[0], inst[1])
                        for ext in self.extractors
                    ]
                    x_train.append(feature_vector)
                    y_train.append(inst[2])
            svm = Supervised(self.args, self.opts)
            if not os.path.exists(constants.get_path()['tmp'] +
                                  '/ltr-features-all'):
                with open(constants.get_path()['tmp'] + '/ltr-features-all',
                          'wb') as mf:
                    json.dump({
                        'x_train': x_train,
                        'y_train': y_train
                    },
                              mf,
                              indent=2)
            svm.train(x_train, y_train)
            all_docs = [
                inst[1] for topc in train_raw for inst in train_raw[topc]
            ]
            return svm, all_docs
        self.model, self.all_docs =\
            _train(train_set)
Example #11
    def train(self, train_set):
        '''
        Train_set is null,
        the training data come from docs_path and json_data_path

        '''
        @object_hashing(cache_comment='svm_models_%s' % hash_obj(train_set),
                        cachedir=constants.get_path()['cache'])
        def _train(train_raw):
            models = {}
            all_docs = {}
            for topic in train_raw:
                x_train = []
                y_train = []
                for inst in train_raw[topic]:

                    feature_vector = [
                        ext.extract(inst[0], inst[1])
                        for ext in self.extractors
                    ]
                    x_train.append(feature_vector)
                    y_train.append(inst[2])
                svm = Supervised(self.args, self.opts)
                with open(
                        constants.get_path()['tmp'] +
                        '/ltr-features-%s' % topic, 'wb') as mf:
                    json.dump({
                        'x_train': x_train,
                        'y_train': y_train
                    },
                              mf,
                              indent=2)
                svm.train(x_train, y_train)
                models[topic.lower()] = svm
                all_docs[topic] = [inst[1] for inst in train_raw[topic]]
            # Return only after every topic has been processed.
            return models, all_docs
        self.models, self.all_docs =\
            _train(train_set)
Example #12
def scrape_climate_data():
    path = get_path()
    for year in range(2013, 2019):
        for month in range(1, 13):
            if month < 10:
                url = 'https://en.tutiempo.net/climate/0{}-{}/ws-421820.html'.format(
                    month, year)
            else:
                url = 'https://en.tutiempo.net/climate/{}-{}/ws-421820.html'.format(
                    month, year)

            data = requests.get(url)
            data_utf = data.text.encode('utf-8')

            if not os.path.exists('{}/assets/climate-data/{}'.format(
                    path, year)):
                os.makedirs('{}/assets/climate-data/{}'.format(path, year))

            with open(
                    '{}/assets/climate-data/{}/{}.html'.format(
                        path, year, month), 'wb') as result:
                result.write(data_utf)

            sys.stdout.flush()
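
The month < 10 branch above exists only to zero-pad the month in the URL; a zero-padded format spec would make the branch unnecessary. A minimal equivalent sketch:

# '{:02d}' left-pads single-digit months with a zero, e.g. 3 -> '03'.
url = 'https://en.tutiempo.net/climate/{:02d}-{}/ws-421820.html'.format(month, year)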
Example #13
import re
import os
import sys
from functools import wraps
from constants import get_path
from pprint import pformat

from nltk.tokenize.regexp import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import codecs
import math
stemmer = PorterStemmer()
lmtzr = WordNetLemmatizer()
tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)
STOPWORDS = get_path()['data'] + '/stopwords.txt'
with file(STOPWORDS) as f:
    stopwords = frozenset([l.strip().lower() for l in f])

from optparse import OptionParser

from time import time as now

try:
    import matplotlib.pyplot as plt
except ImportError:
    pass
try:
    import numpy as np
except ImportError:
    pass
Example #14
File: theme.py Project: AjaxVM/pyglibs
def get_default_theme():
    return load_theme(constants.get_path("default_theme"))
Example #15
from rerank.null import Reranker as RerankInterface
import json
import codecs
import os
import sys
from copy import deepcopy
from libs.evaluate import merge_offsets
from libs.supervised.prep.prepare import Prep
from constants import get_path, join_path
from libs.supervised.classifiers.svm_rank import Supervised
from util.common import hash_obj
from util.cache import simple_caching, object_hashing
from importlib import import_module
import constants

path = get_path()
STOPWORDS_PATH = path['data'] + '/stopwords.txt'
CLF_PATH = join_path(path['root'], 'libs/supervised/classifiers')
docs_path = join_path(path['data'], 'TAC_2014_BiomedSumm_Training_Data')
json_data_path = join_path(path['data'], 'v1-2a.json')

# root_proj_path = os.getcwd()
# while not('.git' in os.listdir(root_proj_path)):
#     root_proj_path = os.path.split(root_proj_path)[0]
# if not(root_proj_path in sys.path):
#     sys.path.append(root_proj_path)


class Reranker(RerankInterface):

    reranker_opts = {
Example #16
from random import randint
from copy import deepcopy
import itertools
from log_conf import Logger
from summarizer.mmr_summarizer import MMR

from util.aritmatic_operations import mean_conf
from util.tokenization import WordTokenizer
from util.common import write_json_as_csv, hash_obj, hash_dict
import gzip

w_t = WordTokenizer(stem=False)

logger = Logger(__file__.split('/')[-1]).logger

path = constants.get_path()
result_outpath = 'tmp/tmpres/'

_ANNS_DIR = path['ann']
_ANNS_PATH = path['ann_json']
CACHE = path['cache']

valid_topics = ['all']
# doc_mod = DocumentsModel(_ANNS_DIR)

CACHE_FILE = constants.join_path(
    CACHE, 'umls.json')
if os.path.isfile(CACHE_FILE):
    try:
        with codecs.open(CACHE_FILE, 'rb', 'utf-8') as mf:
            cachefile = json.load(mf)
Example #17
import re
import os
import sys
from functools import wraps
from constants import get_path
from pprint import pformat

from nltk.tokenize.regexp import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import codecs
import math
stemmer = PorterStemmer()
lmtzr = WordNetLemmatizer()
tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)
STOPWORDS = get_path()['data'] + '/stopwords.txt'
with file(STOPWORDS) as f:
    stopwords = frozenset([l.strip().lower() for l in f])

from optparse import OptionParser

from time import time as now

try:
    import matplotlib.pyplot as plt
except ImportError:
    pass
try:
    import numpy as np
except ImportError:
    pass
Example #18
def get_data(docs_path=constants.get_path()['ann'],
             json_data_path=constants.get_path()['ann_json']):
    '''
    Populates the docs_new object which stores information
        about the topics
        format of the docs_new:
        [ <list>(dict) keys (topic_id): 'd1418_train', ...:
            {'d1418_train' : [ <list>(dict) keys (citance_number): u'1', u'2', ...
                {u'11': [ <list>(dict) keys (annotator_id): 'I', 'B',...
                    {u'I': <dict>, keys:'ref_art',
                                    'not_relevant',
                                    'cit_offset',
                                    'cit_art',
                                    'ref_offset',
                                    'cit_text',
                                    'ref_text' }
    Args:
        docs_path(str): Path to the training data directory
            e.g. data/TAC_2014_BiomedSumm_Training_Data

        json_data_path(str): Path to the json training file (v1-2a.json)

    Returns:
        dict with the above format
    '''
    doc_mod = DocumentsModel(docs_path)
    docs = doc_mod.get_all()
    with codecs.open(json_data_path, 'rb', 'utf-8') as mf:
        data = json.load(mf)
    docs_new = {}
    #     print docs.keys()
    #     print docs.values()[0].keys()

    for tid, annotations in data.iteritems():
        if tid not in docs_new:
            docs_new[tid] = {}
        for annotator_id, ann_list in annotations.iteritems():
            for ann in ann_list:
                cit = ann['citance_number']
                if cit not in docs_new[tid]:
                    docs_new[tid][cit] = {}
                docs_new[tid][cit][annotator_id] = {}
                if 'ref_offset' not in docs_new[tid][cit][annotator_id]:
                    docs_new[tid][cit][annotator_id]['ref_offset'] =\
                        ann['reference_offset']
                else:
                    docs_new[tid][cit][annotator_id]['ref_offset'] = union(
                        docs_new[tid][cit][annotator_id]['ref_offset'] +
                        ann['reference_offset'])
                if 'cit_offset' not in docs_new[tid][cit][annotator_id]:
                    docs_new[tid][cit][annotator_id]['cit_offset'] =\
                        [ann['citation_offset']]
                else:
                    docs_new[tid][cit][annotator_id]['cit_offset'] = union(
                        docs_new[tid][cit][annotator_id]['cit_offset'] +
                        [ann['citation_offset']])
                docs_new[tid][cit][annotator_id]['ref_art'] = ann[
                    'reference_article']
                docs_new[tid][cit][annotator_id]['cit_art'] = ann[
                    'citing_article']

    for tid in docs_new:
        for cit in docs_new[tid]:
            for ann in docs_new[tid][cit]:
                docs_new[tid][cit][ann]['ref_text'] =\
                    [(s, doc_mod.get_doc(tid,
                                         docs_new[tid][cit][ann][
                                             'ref_art'].lower(),
                                         interval=s)) for s in
                     docs_new[tid][cit][ann]['ref_offset']]
                cit_off = union(docs_new[tid][cit][ann]['cit_offset'])
                docs_new[tid][cit][ann]['cit_text'] =\
                    ' '.join([doc_mod.get_doc(tid, docs_new[tid][cit][ann][
                        'cit_art'].lower(), intrvl) for
                        intrvl in cit_off])

    return docs_new
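
Based on the structure described in the docstring, a caller might read one annotation roughly as follows; the topic, citance and annotator keys are the illustrative ones from the docstring and are not guaranteed to exist in a given data set:

# Illustrative access into the nested dict returned by get_data().
docs_new = get_data()
ann = docs_new['d1418_train'][u'11'][u'I']
print ann['cit_text']    # citation text joined over the merged citation offsets
print ann['ref_offset']  # union of the annotator's reference offsets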
Example #19
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from heapq import heappush, heappushpop
try:
    import cPickle as pickle
except:
    import pickle
import os
import re
from util.tokenization import WordTokenizer, SentTokenizer
from nltk.stem.porter import PorterStemmer
from util.common import (hash_obj, VerbosePrinter, flatten)
from constants import get_path

cache_DIR = get_path()['cache']


class Summarizer(object):
    '''
    Base class for summarizers
    '''
    method_opts = {}

    def __init__(self, args=None, opts=None):
        """ Initialize the Summarizer.
            args is a list of arguments for the Summarizer (typically
            from input evaluate.py.
            opts is a ArgumentParser or OptionParser object.

            Notes
Example #20
from rerank.null import Reranker as RerankInterface
import json
import codecs
import os
import sys
from libs.evaluate import merge_offsets
from libs.supervised.prep.prepare import Prep
from constants import get_path, join_path
from libs.supervised.classifiers.svm_rank import Supervised
from util.common import hash_obj
from util.cache import simple_caching, object_hashing
from importlib import import_module
import constants
import operator

path = get_path()
STOPWORDS_PATH = path['data'] + '/stopwords.txt'
CLF_PATH = join_path(
    path['root'], 'libs/supervised/classifiers')
docs_path = join_path(path['data'], 'TAC_2014_BiomedSumm_Training_Data')
json_data_path = join_path(path['data'], 'v1-2a.json')

# root_proj_path = os.getcwd()
# while not('.git' in os.listdir(root_proj_path)):
#     root_proj_path = os.path.split(root_proj_path)[0]
# if not(root_proj_path in sys.path):
#     sys.path.append(root_proj_path)


class Reranker(RerankInterface):