def __init__(self, url: str, method: str = 'GET', *, callback=None, load_js: bool = False,
             metadata: dict = None, headers: dict = None, request_config: dict = None,
             request_session=None, res_type: str = 'text', **kwargs):
    """
    Initialization parameters for a single request: target URL, HTTP method,
    callback, and per-request configuration.
    """
    self.url = url
    self.method = method.upper()
    if self.method not in self.METHOD:
        raise ValueError('%s method is not supported' % self.method)
    self.callback = callback
    self.load_js = load_js
    self.headers = headers
    self.metadata = metadata if metadata is not None else {}
    self.request_session = request_session
    if request_config is None:
        self.request_config = self.REQUEST_CONFIG
    else:
        self.request_config = request_config
    self.res_type = res_type
    self.kwargs = kwargs
    self.close_request_session = False
    self.logger = get_logger(name=self.name)
    self.retry_times = self.request_config.get('RETRIES', 3)
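# Usage sketch (not from the original source): constructing one of these request
# objects. The enclosing class name `Request`, its METHOD whitelist, its
# REQUEST_CONFIG defaults, and the `parse` callback below are assumptions made
# only to illustrate the parameters accepted by the __init__ above.
def parse(response):
    return response

request = Request(
    'https://httpbin.org/get',
    method='get',                    # normalized to 'GET' by method.upper()
    callback=parse,
    metadata={'page': 1},            # defaults to {} when omitted
    request_config={'RETRIES': 5},   # overrides the class-level REQUEST_CONFIG
)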
def test_logfile_created(self):
    from tempfile import mkdtemp
    from core.utils import get_logger
    from os.path import isfile
    filename = "%s/sourcerer.log" % mkdtemp()
    logger = get_logger('test', filename)
    if not isfile(filename):
        self.fail("Log file %s not created by get_logger()" % filename)
def test_log_filename_not_writeable(self):
    from core.utils import get_logger
    filename = "%s/sourcerer.log" % self._get_nonexistant_directory_name()
    try:
        logger = get_logger('test', filename)
    except IOError:
        # get_logger() is expected to handle the IOError itself; if it leaks out, fail.
        self.fail("get_logger() raised IOError for a nonexistent logfile directory.")
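# The core.utils.get_logger implementation is not shown in this collection.
# A minimal sketch of one way it could satisfy the logger tests here (create the
# log file, write error records to it, and swallow the error when the target
# directory does not exist); the names and log format are assumptions.
import logging
import sys


def get_logger(name, filename=None):
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)
    handler = logging.StreamHandler(sys.stderr)
    if filename is not None:
        try:
            # FileHandler opens (and creates) the file immediately.
            handler = logging.FileHandler(filename)
        except OSError:  # IOError is an alias of OSError on Python 3
            # Fall back to console output so callers never see the error,
            # as test_log_filename_not_writeable expects.
            pass
    handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
    logger.addHandler(handler)
    return logger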
def __init__(self, middleware, loop=None):
    if not self.start_urls or not isinstance(self.start_urls, list):
        raise ValueError(
            "Spider must define a non-empty list attribute named start_urls, "
            "e.g. start_urls = ['https://www.github.com']"
        )
    self.logger = get_logger(name=self.name)
    self.loop = loop or asyncio.new_event_loop()
    asyncio.set_event_loop(self.loop)
    self.request_queue = asyncio.Queue()
    self.sem = asyncio.Semaphore(getattr(self, 'concurrency', 3))
    self.middleware = middleware or Middleware()
def __init__(self, bucket, resource='s3'):
    self.bucket = bucket
    self.app_logger = get_logger('app')
    # Refresh the cached boto3 session if it is missing or has expired.
    if 'session' not in self.session or self.session['expire'] < datetime.utcnow():
        self.session.update({
            'session': boto3.Session(
                aws_access_key_id=configs.AWS_ACCESS_KEY_ID,
                aws_secret_access_key=configs.AWS_SECRET_ACCESS_KEY),
            'expire': datetime.utcnow() + timedelta(hours=1)
        })
    # Create the client for this resource once and reuse it afterwards.
    if resource not in self.clients:
        self.clients.update(
            {resource: self.session['session'].client(resource)})
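# The constructor above reads and writes self.session and self.clients without
# creating them, so they are presumably class-level caches shared by all
# instances. A sketch of the surrounding class under that assumption; the class
# name S3Storage is invented for illustration.
class S3Storage:
    # {'session': boto3.Session(...), 'expire': datetime} once populated
    session = {}
    # one boto3 client per resource name, e.g. {'s3': <client>}
    clients = {}

    # ... __init__ as shown above ...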
def test_log_error(self):
    from core.utils import get_logger
    from os.path import isfile
    from tempfile import mkdtemp
    filename = "%s/sourcerer.log" % mkdtemp()
    logger = get_logger('test', filename)
    error_msg = "This is an error."
    logger.error(error_msg)
    with open(filename) as f:
        line = f.readline()
    # The log entry written to the file should contain our error message.
    self.assertNotEqual(line.find(error_msg), -1)
def evaluate_line(input_str):
    config = load_config(file_path + "/" + FLAGS.config_file)
    logger = get_logger(FLAGS.log_file)
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with open(file_path + "/" + FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, file_path + "/" + FLAGS.ckpt_path,
                             load_word2vec, config, id_to_char, logger)
        # for report in input_str:
        #     result = model.evaluate_line(sess, input_from_line(report, char_to_id), id_to_tag)
        #     print(result)
        report = input_str
        result = model.evaluate_line(sess, input_from_line(report, char_to_id), id_to_tag)
        print(result)
        return result
import sys
import os
import time
import argparse

from core.collection import DataCollector
from core.utils import get_logger

logger = get_logger('collect', dest=['console', 'file'])

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Collect data')
    parser.add_argument('-s', '--simulate', action='store', dest='simulate', default=None,
                        help='Simulate data collection')
    parser.add_argument('-i', '--interval', action='store', dest='interval', default='2',
                        help='Interval between scans, in seconds. Only active if simulate is True')
    parser.add_argument('-d', '--directory', action='store', dest='directory', default='tmp',
                        help='Directory to watch')
    parser.add_argument('-p', '--parent', action='store_true', dest='parent', default=False,
                        help='Monitor the provided directory for the first new folder, '
                             'then monitor that folder for new files')
import os

import pyhocon
import torch
import torchvision

from core.engine import train, evaluate, LargerHolder
from core.metric import AverageMetric, AccuracyMetric
from core.model import cifar_resnet20
from core.loss import CACLoss
from core.utils import get_args, get_logger
from core.utils import set_cudnn_auto_tune
from core.utils import FLOPs
from core.utils import replace_convs_with_cac

if __name__ == "__main__":
    args = get_args()
    hocon = pyhocon.ConfigFactory.parse_file(args.config)
    output_directory = args.output_directory
    os.makedirs(output_directory, exist_ok=False)
    logger = get_logger("train", output_directory)
    set_cudnn_auto_tune()
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    train_transform = torchvision.transforms.Compose([
        torchvision.transforms.RandomCrop(size=32, padding=4),
        torchvision.transforms.RandomHorizontalFlip(),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(
            mean=hocon.get_list("dataset.mean"),
            std=hocon.get_list("dataset.std"),
        ),
    ])
    val_transform = torchvision.transforms.Compose([
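# The training script above pulls normalization statistics out of a HOCON file
# via pyhocon. A small self-contained sketch of a config shape that would
# satisfy the two get_list calls; the numeric values are placeholders, not the
# project's real configuration.
import pyhocon

example_conf = pyhocon.ConfigFactory.parse_string("""
dataset {
  mean = [0.4914, 0.4822, 0.4465]
  std  = [0.2470, 0.2435, 0.2616]
}
""")
print(example_conf.get_list("dataset.mean"))  # -> [0.4914, 0.4822, 0.4465]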
import requests
from bs4 import BeautifulSoup as bs
import traceback

from core.db import insert
from core.utils import (
    parse_time,
    headers,
    logger_time,
    get_logger,
)

SRC_URL = 'http://openinsider.com/screener?s={}&o=&pl=&ph=&ll=&lh=&fd=0&fdr=&td=0&tdr=&fdlyl=&fdlyh=&daysago=&xp=1&xs=1&vl=&vh=&ocl=&och=&sic1=-1&sicl=100&sich=9999&grp=0&nfl=&nfh=&nil=&nih=&nol=&noh=&v2l=&v2h=&oc2l=&oc2h=&sortcol=0&cnt=10000000&page=1'

logger = get_logger('openinsider-service')


def get_request(insider_url, ticker):
    logger.info(f'Get request with tickername: {ticker}')
    session = requests.Session()
    data = session.get(insider_url.format(ticker), headers=headers, stream=True)
    return data.text


@logger_time
def get_data(html):
    soup = bs(html, 'lxml')
    rows = soup.find('table', class_='tinytable').find('tbody').find_all('tr')
    for i in rows:
from typing import Dict, List, Tuple, AnyStr
from datetime import datetime

import requests
from pytz import timezone

from core.utils import (
    parse_time,
    headers,
    get_logger,
    url,
    logger_time,
)
from core.db import insert

logger = get_logger('pulse-service')


def get_cursor_number(url: str, ticker: str, cursor='9999999') -> str:
    session = requests.Session()
    logger.info(f'Get cursor number from {url.format(ticker, cursor)}')
    data = session.get(url.format(ticker, cursor), headers=headers, stream=True)
    logger.info(f"Prev cursor number is {data.json()['payload']['nextCursor']}")
    return data.json()['payload']['nextCursor']


def get_data_from_api(url: str, ticker: str, cursor: str) -> None:
    session = requests.Session()
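# get_data_from_api is cut off above, but get_cursor_number suggests the API is
# paginated with a 'nextCursor' field. A hypothetical helper showing how such a
# cursor chain could be walked; the stop condition, the payload shape beyond
# 'nextCursor', and the function name are assumptions, not the original code.
def iter_pages(api_url: str, ticker: str, cursor: str = '9999999'):
    session = requests.Session()
    while cursor:
        payload = session.get(api_url.format(ticker, cursor),
                              headers=headers).json()['payload']
        yield payload
        # Stop when the API no longer returns a next cursor.
        cursor = payload.get('nextCursor')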
import os
import time
import argparse

from core.preprocessing import Preprocessor
from core.utils import get_logger

logger = get_logger("preprocess", dest=["console", "file"])

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Preprocess data")
    parser.add_argument("config", action="store", help="Name of configuration file")
    args = parser.parse_args()

    logger.info("Loading preprocessing pipeline from %s" % args.config)
    preproc = Preprocessor(args.config)
    preproc.run()
def train():
    # Load the datasets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Choose the tag scheme (IOB / IOBES); IOBES is used by default
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    if not os.path.isfile(FLAGS.map_file):
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)
        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            # {'S-LOC': 10, 'E-LOC': 3, 'B-ORG': 4, 'S-PER': 11, 'S-ORG': 12, 'O': 0,
            #  'E-ORG': 5, 'I-LOC': 6, 'I-PER': 7, 'I-ORG': 1, 'B-PER': 8, 'B-LOC': 2, 'E-PER': 9}
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # Convert the sentences to numeric data
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." % (
        len(train_data), len(dev_data), len(test_data)))

    # Batch the data, zero-padding sequences that are too short
    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # GPU settings
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    # Compute the mean loss once every 100 steps, then reset
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
#!/usr/bin/env python
import os
import time
import argparse

from core.stimulation import Stimulator
from core.utils import get_logger

logger = get_logger('stimulate', dest=['console', 'file'])

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Run stimulation')
    parser.add_argument('config', action='store', nargs='?', default='stim-01',
                        help='Name of configuration file')
    args = parser.parse_args()

    stim = Stimulator(args.config)
    stim.run()  # this will start an infinite run loop
from django import forms
from django.forms.widgets import Select, HiddenInput, CheckboxSelectMultiple

from core.models import Comment, CommentType, Topic
from core import utils
from tagger.models import Tag
from users.models import UserProfile

logger = utils.get_logger(__name__)


class CommentDeleteForm(forms.Form):
    allcomments = Comment.objects.filter(is_deleted=False).filter(is_parent=True)
    comments = forms.ModelMultipleChoiceField(allcomments)


class CommentTopicForm(forms.Form):
    allcomments = Comment.objects.filter(is_deleted=False).filter(is_parent=True)
    alltopics = Topic.objects.filter(is_deleted=False)
    comment = forms.ModelChoiceField(allcomments, empty_label=None)
    topic = forms.ModelChoiceField(alltopics, empty_label=None)


class TopicDeleteForm(forms.Form):
    alltopics = Topic.objects.filter(is_deleted=False)
    topics = forms.ModelMultipleChoiceField(alltopics)


class NewSummaryForm(forms.Form):
    """Form to let a user create a new summary for a topic."""
    alltopics = Topic.objects.filter(is_deleted=False)