def _analyze(recording):
    # We pass a tuple because SQLAlchemy objects are not fully process-safe.
    # They can be pickled and unpickled, but they won't be bound to a Session
    # anymore. This means they can't refresh their attributes, which is
    # something they try to do after a commit() from inside the main process.
    recording_id, sonogram_url_small = recording
    try:
        # Create one fetcher per process.
        global _sonogram_fetcher  # pylint: disable=global-statement
        if not _sonogram_fetcher:
            _sonogram_fetcher = fetcher.Fetcher(cache_group='xc_sonograms_small',
                                                pool_size=1)

        sonogram = None
        try:
            sonogram = _sonogram_fetcher.fetch_cached(sonogram_url_small)
        except fetcher.FetchError as ex:
            logging.warning(
                f'Sonogram for recording {recording_id} could not be fetched',
                exc_info=True)

        sonogram_quality = -999999
        if sonogram:
            sonogram_quality = analysis.sonogram_quality(recording_id, sonogram)

        return (recording_id, sonogram_quality)
    except Exception as ex:
        # Re-raise as something that's guaranteed to be pickleable.
        logging.error('Exception during analysis', exc_info=True)
        raise RuntimeError(f'Exception during analysis: {ex}')
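# A minimal, hypothetical driver for _analyze (not part of the original
# module), sketched to show why it takes a plain tuple: the ORM objects stay
# in the main process, and only picklable (recording_id, sonogram_url_small)
# pairs cross the process boundary. `session`, the `Recording` model, and the
# process count are assumptions for illustration only.
import multiprocessing

def _analyze_all(session, processes=4):
    work = [(r.recording_id, r.sonogram_url_small) for r in session.query(Recording)]
    with multiprocessing.Pool(processes) as pool:
        # imap_unordered streams (recording_id, sonogram_quality) tuples back
        # as workers finish; collect them into a dict keyed by recording_id.
        return dict(pool.imap_unordered(_analyze, work))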
def _process_image(image):
    '''
    Entry point for parallel processing.
    '''
    global _fetcher  # pylint: disable=global-statement
    if not _fetcher:
        _fetcher = fetcher.Fetcher('wp_images', pool_size=1)

    full_output_file_name = os.path.join(_args.image_output_dir, image.output_file_name)
    if os.path.exists(full_output_file_name) and not _args.recreate_images:
        return image.output_file_name

    image_data = _fetcher.fetch_cached(image.image_file_url)
    pil_image = PIL.Image.open(io.BytesIO(image_data))
    if pil_image.width > _args.image_size or pil_image.height > _args.image_size:
        if pil_image.width >= pil_image.height:
            output_width = _args.image_size
            output_height = round(output_width / pil_image.width * pil_image.height)
        else:
            output_height = _args.image_size
            output_width = round(output_height / pil_image.height * pil_image.width)
        pil_image = pil_image.resize((output_width, output_height),
                                     resample=PIL.Image.LANCZOS)
    pil_image.save(full_output_file_name, format='WebP', quality=_args.image_quality)
    return image.output_file_name
def __init__(self, weekfile, weeknumber):
    self.weekreview = fetcher.Fetcher(weekfile, end_ind="tag")
    self.weekreview.read_file()
    self.weektable = tablemd.tablemd(self.weekreview.content_data)
    weektitle = [self.weektable.readcell(0, 2 * i + 1).encode('ascii', 'ignore')
                 for i in range(3)]
    # Python 2: filter() returns a str here, so int() can parse it directly.
    weeknumbers = [int(filter(str.isdigit, strt)) for strt in weektitle]
    self.week_index = 1 + weeknumbers.index(weeknumber) * 2
def FetchAsyncWithAuth(*args, **kwargs):
    credential = model.GetOAuth2Credential('github')
    if credential:
        url_auth_suffix = ('?client_id={0}&client_secret={1}'
                           .format(credential.client_id, credential.client_secret))
    else:
        url_auth_suffix = ''
    return fetcher.Fetcher(*args, url_auth_suffix=url_auth_suffix, **kwargs)
def PopulateRepos(self):
    shared.EnsureRunningInTask()  # gives us automatic retries
    baseurl = self.repo_collection.key.id()
    fetched = fetcher.Fetcher(baseurl, follow_redirects=True)
    page = fetched.content
    candidate_repos = self._GetChildPaths(page)
    fetches = []

    # we found a project in the root directory
    if 'app.yaml' in candidate_repos:
        candidate_repos.insert(0, '')

    if common.IsDevMode():
        # fetch fewer repos during development
        candidate_repos = candidate_repos[:1]

    for c in candidate_repos:
        if c and not c.endswith('/'):
            continue
        project_url = '{0}{1}'.format(baseurl, c)
        app_yaml_url = '{0}app.yaml'.format(project_url)
        fetched = fetcher.Fetcher(app_yaml_url, follow_redirects=True)
        fetches.append((c, project_url, app_yaml_url, fetched))

    repos = []
    for c, project_url, app_yaml_url, fetched in fetches:
        try:
            content = fetched.content
            shared.i('found app.yaml: {}'.format(app_yaml_url))
            name = c.rstrip('/') or project_url
            description = 'Sample code from {0}'.format(project_url)
            model.CreateRepoAsync(repo_url=project_url, html_url=project_url,
                                  name=name, description=description,
                                  open_files=[])
        except urlfetch_errors.Error:
            exc_info = sys.exc_info()
            formatted_exception = traceback.format_exception(exc_info[0], exc_info[1],
                                                             exc_info[2])
            shared.w('skipping {0}'.format(project_url))
            for line in [line for line in formatted_exception if line]:
                shared.w(line)
async def main(delay_rate, server_upper_limit):
    server_upper_limit = asyncio.Semaphore(server_upper_limit)
    obj = fetcher.Fetcher(delay_rate=delay_rate, server_upper_limit=server_upper_limit)
    resultDataSet = await obj.main()
    resultDataSet = {"database": resultDataSet}
    with open('./data.json', 'w') as json_file:
        json_file.write(json.dumps(resultDataSet))
    print("The entire data has been stored into the data.json file")
    time.sleep(0.1)
def crawl(self, config):
    print 'start crawling'
    # Instantiate the multi-threaded fetch module, specifying 3 threads.
    f = fetcher.Fetcher(config)
    f.start()
    while f.get_running_count() > 0:
        print '------------ running threads : %s ------------' % f.get_running_count()
        time.sleep(5)
    f.printFinishLog()
def __init__(self):
    self.working = True
    self.threads = []
    self.log = log.Log(self)
    self.fetcher = fetcher.Fetcher(self)
    self.process = process.Process(self)
    self.db = db.Db(self)
    self.fetcher.start()
    self.process.start()
    self.db.start()
def test_login_failure(self):
    cookieFile = tempfile.mkstemp()[1]
    try:
        os.remove(cookieFile)
        get_HTML = fetcher.Fetcher(lambda x: sys.stdout.write(x + '\n'),
                                   lambda x: cookieFile).getHTML
        user = User(get_HTML)
        userID, real_name = user.login(self.username, self.password + "not")
        self.assertIsNone(userID)
        self.assertIsNone(real_name)
    finally:
        os.remove(cookieFile)
def test_login_success(self):
    cookieFile = tempfile.mkstemp()[1]
    try:
        os.remove(cookieFile)
        get_HTML = fetcher.Fetcher(lambda x: sys.stdout.write(x + '\n'),
                                   lambda x: cookieFile).getHTML
        user = User(get_HTML)
        userID, real_name = user.login(self.username, self.password)
        # Weak assertions, but we don't want to tie the test to a particular user.
        self.assertIsNotNone(userID)
        self.assertIsNotNone(real_name)
    finally:
        os.remove(cookieFile)
def add_files(dirname):
    url = os.path.join(repo_url, dirname)
    fetched = fetcher.Fetcher(url, follow_redirects=True)
    page = fetched.content
    paths = self._GetChildPaths(page)
    shared.i('{0} -> {1}', url, paths)
    if not paths:
        shared.i('- {0}'.format(dirname))
        tree.SetFile(dirname, page)
    for path in paths:
        if common.GetExtension(path) in settings.SKIP_EXTENSIONS:
            continue
        relpath = os.path.join(dirname, path)
        add_files(relpath)
def get_wiktionary_fields(entry, language):
    """Fetch Wiktionary info about the word and update its fields."""
    myFetcher = fetcher.Fetcher()
    fetchedPage = myFetcher.fetch(entry)
    if fetchedPage:
        myParser = parser.Parser()
        myParser.setLanguage(language)
        myParser.setWordName(entry)
        extractedWord = myParser.extractData(fetchedPage)
        return extractedWord
    else:
        return False
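# A hypothetical call, just to illustrate the contract of get_wiktionary_fields:
# it returns the parsed word data on success and False when the page could not
# be fetched. 'maison' and 'French' are arbitrary example inputs, not taken
# from the original project.
word = get_wiktionary_fields('maison', 'French')
if word is False:
    print('Wiktionary page could not be fetched')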
def scan(settings: Settings):
    logger.warning("Scan houses from web site")
    storage: fetcher.HouseStore = store.LiteHouseStore(
        settings.storage["sqlite"]["path"])
    notifier: fetcher.Notifier
    if settings.core["should_notify"]:
        notifier = notify.IftttNotifier(
            logger,
            fetcher.get_591_url,
            settings.notify["ifttt"]["webhook_token"],
            settings.notify["ifttt"]["event_name"],
        )
    else:
        notifier = notify.EmptyNotifier()
    agent = fetcher.Fetcher(logger, settings.core["page_delay"])
    scanner = fetcher.Scanner(logger, storage, notifier, agent,
                              settings.core["batch_delay"])
    scanner.scan_house(settings.query)
def files():
    return fetcher.Fetcher(datadir).fetch_files(
        'benchdens', 'e0(2|4|6|8)n((100))\\.00(0|1|2)\\.in')
def fileshighhk90():
    return fetcher.Fetcher(datadir).fetch_files(
        'holmefinal', 'n90d(6|8)0p(\\d\\d)\\.00(0|1|2|3|4)\\.in')
def filesn90d60fixedpart():
    return fetcher.Fetcher(datadir).fetch_files(
        'fixedpart', 'p(\\d)n90d0(6)\\.00(0|1)\\.in')
def fileshigh90():
    return fetcher.Fetcher(datadir).fetch_files(
        'benchdens', 'e0(6|8)n((90))\\.00(0|1|2|3|4)\\.in')
def fileshigh():
    return fetcher.Fetcher(datadir).fetch_files(
        'benchdens', 'e0(6|8)n((100))\\.00(3|4)\\.in')
#
# Fetch full images from the library for a random subset of records.

from optparse import OptionParser

import fetcher
import random
import record

if __name__ == '__main__':
    parser = OptionParser()
    parser.add_option("-n", "--num", dest="num", default=100,
                      help="How many images to fetch.")
    parser.add_option("", "--seed", dest="seed", default=12345,
                      help="Random number seed.")
    parser.add_option("-c", "--output_dir", dest="cache_dir", default="images",
                      help="Images destination dir")
    parser.add_option("-s", "--secs", dest="secs", default=5,
                      help="Number of seconds to wait between fetches.")
    (options, args) = parser.parse_args()

    rs = record.AllRecords()
    rand = random.Random(options.seed)
    rand.shuffle(rs)

    f = fetcher.Fetcher(options.cache_dir, int(options.secs))
    for i, r in enumerate(rs[0:int(options.num)]):
        print "%03d Fetching %s" % (i, r.photo_url)
        f.Fetch(r.photo_url)
def __init__(self):
    self.database = database.Database()
    self.fetcher = fetcher.Fetcher()
    current_version = self.fetcher.version()
    if current_version > self.database.version():
        self.update_database(str(current_version))
def filesp1():
    return fetcher.Fetcher(datadir).fetch_files(
        'singlepart', 'e0(2|4|6|8)n((100))p1\\.00(0|1|2)\\.in')
# Example for Fetcher.cpp
import cv2

import fetcher

myFetcher = fetcher.Fetcher()
ptNumber = 0

def onMouse(event, x, y, flags, param):
    global ptNumber
    if event == cv2.EVENT_LBUTTONUP:
        myFetcher.setPt(x, y, ptNumber)
        ptNumber = ptNumber + 1

cv2.namedWindow("Video Captured")
cv2.setMouseCallback("Video Captured", onMouse)
cap = cv2.VideoCapture(0)

while (True):
    # Capture frame-by-frame
    ret, frame = cap.read()
    key = cv2.waitKey(1)
    # Our operations on the frame come here
    fileName = myFetcher.calibrate(frame, 5, 200, key)
#!/usr/bin/python

import record
import fetcher
import os

rs = record.AllRecords()
f = fetcher.Fetcher('images', 0)
rs = [r for r in rs if (r.photo_url and f.InCache(r.photo_url))]
for idx, r in enumerate(rs):
    in_image = f.CacheFile(r.photo_url)
    out_image = 'thumbnails/%s.jpg' % r.photo_id()
    cmd = 'convert %s -resize 200x200 %s' % (in_image, out_image)
    print '%05d %s' % (idx, cmd)
    os.system(cmd)
def fileshk():
    return fetcher.Fetcher(datadir).fetch_files(
        'holme', 'n100d0(1|2|3|4)\\.00(3|4)\\.in')
def __init__(self, cap, key=1):
    self.cap = cap
    self.key = key
    self.patch = None
    self.myFetcher = fetcher.Fetcher()
    signal.signal(signal.SIGTSTP, self.handler)
def filesvlow():
    return fetcher.Fetcher(datadir).fetch_files(
        'benchdens', 'e0(2)n((100))\\.00(3|4)\\.in')
consumer_key = os.getenv("CONSUMER_KEY", None)
consumer_secret = os.getenv("CONSUMER_SECRET", None)
access_token = os.getenv("ACCESS_TOKEN", None)
access_token_secret = os.getenv("ACCESS_TOKEN_SECRET", None)
if (
    consumer_key is None
    or consumer_secret is None
    or access_token is None
    or access_token_secret is None
):
    print("No credentials provided")
    sys.exit(1)

fetcher = fetcher.Fetcher(
    consumer_key, consumer_secret, access_token, access_token_secret
)

app = flask.Flask(__name__)

@app.route("/", methods=["GET", "POST"])
def handler():
    if flask.request.method == "GET":
        return {"status": "ok"}, 200
    else:
        if flask.request.json is None:
            app.logger.error("POST req without req body received")
            return {"status": "error"}, 500
        else:
            try:
                ids = flask.request.json["ids"]
# Parse the command line arguments, and load the config file.
args = get_args()
config = configreader.Configurator(args.settings)
cfg, notices = config.read_settings('spatial')
#cfg, notices = configreader.read_settings(args.settings)

today = datetime.date.today()
if args.weekly:
    searchdate = today + datetime.timedelta(days=-7)
    dbname = 'w_' + cfg['localdb']
else:
    searchdate = today
    dbname = cfg['localdb']

#test date please ignore.
#searchdate = datetime.date(2014, 11, 8)

print 'Fetching from {0} to now...'.format(searchdate.isoformat())

lamagetter = fetcher.Fetcher(cfg['connection'])
localdb = LocalStore(dbname)

for notice in notices:
    try:
        rows = lamagetter.fetch(notice['sql'].format(searchdate))
    except fetcher.FetcherError as e:
        print ' - Error accessing DB: {0}'.format(e.message)
        sys.exit(1)
    localdb.save_data(notice['table'], rows, notice['uidfield'])
    print ' - {0} rows saved to table {1}'.format(len(rows), notice['table'])

localdb.close_db()
print 'NoticeLAMA Complete!'
def trim_recording(recording, skip_if_exists=True, skip_write=False,
                   debug_otsu_threshold=False, debug_utterances=False):
    '''
    Trims the given recording and stores it to a file. Returns the file name,
    or None if this recording is permanently untrimmable for some reason.
    '''
    global _fetcher  # pylint: disable=global-statement
    if not _fetcher:
        _fetcher = fetcher.Fetcher('recordings', pool_size=1)

    output_file_name = trimmed_recording_file_name(recording)
    if skip_if_exists and os.path.exists(output_file_name):
        return output_file_name

    try:
        data = _fetcher.fetch_cached(recording.audio_url)
    except fetcher.FetchError as ex:
        logging.error(f'Error fetching {recording.recording_id}: {ex}')
        return None

    try:
        sound = pydub.AudioSegment.from_file(io.BytesIO(data), 'mp3')
    except Exception as ex:  # pylint: disable=broad-except
        # These errors can get extremely long.
        logging.error(
            f'Failed to decode audio file for {recording.url} '
            f'(cache file {_fetcher.cache_file_name(recording.audio_url)}): {str(ex)[:5000]}')
        return None

    # pydub does everything in milliseconds, and so do we, unless otherwise
    # specified.
    sound = sound[:1000 * _AUDIO_SCAN_DURATION]
    sound = sound.set_channels(1)
    sound = sound.set_frame_rate(_AUDIO_SAMPLE_RATE)

    min_duration = round(1000 * _MIN_AUDIO_DURATION)
    max_duration = round(1000 * _MAX_AUDIO_DURATION)
    padding_duration = round(1000 * _AUDIO_PADDING_DURATION)
    fade_duration = round(1000 * _AUDIO_FADE_DURATION)

    # Find longest utterance, the end of which is a good place to cut off the
    # sample.
    utterances = list(
        _detect_utterances(sound, debug_otsu_threshold=debug_otsu_threshold))
    # This should not happen, because the threshold is such that there is
    # always something above it.
    assert utterances, f'No utterances detected in {recording.url}'

    # Exhaustively search all possible ranges of consecutive utterances that we
    # want to include, and score them by desirability.
    candidates = []
    # We try to start only from the first three utterances, because recordists
    # tend to trim the audio such that it starts on a relevant bit. This seems
    # to help to avoid including (unlabelled) background species and other
    # noise.
    for i, start_utterance in enumerate(utterances[:3]):
        start_ms = max(0, start_utterance[0] - padding_duration)
        utterance_duration = 0
        for end_utterance in utterances[i:]:
            utterance_duration += end_utterance[1] - end_utterance[0]
            end_ms = min(len(sound), end_utterance[1] + padding_duration)
            total_duration = end_ms - start_ms
            # First criterion: it must be long enough. More negative is more bad.
            longness_score = min(0.0, total_duration - min_duration)
            # Second criterion: it must not be too long. More negative is more bad.
            shortness_score = min(0.0, max_duration - total_duration)
            # Third criterion: it must have a good utterance to silence ratio.
            utterance_score = utterance_duration / total_duration
            score_vector = (longness_score, shortness_score, utterance_score)
            candidates.append((score_vector, (start_ms, end_ms)))
    _, (start_ms, end_ms) = max(candidates)

    duration_ms = end_ms - start_ms
    # Never go above the maximum duration.
    if duration_ms > max_duration:
        end_ms = start_ms + max_duration
    # Never go below the minimum duration.
    if duration_ms < min_duration:
        # Try adding half of the missing duration before and half after.
        margin_ms = (min_duration - duration_ms + 1) // 2
        start_ms -= margin_ms
        end_ms += margin_ms
        if start_ms < 0:
            # Running up to the start of the sound.
            start_ms = 0
            end_ms = min(len(sound), start_ms + min_duration)
        if end_ms > len(sound):
            # Running up to the end of the sound.
            end_ms = len(sound)
            start_ms = max(0, end_ms - min_duration)

    sound = sound[start_ms:end_ms]
    sound = sound.fade_in(fade_duration).fade_out(fade_duration)
    sound = pydub.effects.normalize(sound)

    if debug_utterances:
        import subprocess  # pylint: disable=import-outside-toplevel
        import tempfile  # pylint: disable=import-outside-toplevel
        from PIL import Image, ImageDraw  # pylint: disable=import-outside-toplevel
        sonogram_data = _fetcher.fetch_cached(recording.sonogram_url_full)
        sonogram = Image.open(io.BytesIO(sonogram_data))
        draw = ImageDraw.Draw(sonogram, mode='RGBA')

        def highlight(start_ms, end_ms, color):
            # Fixed parameters for full sonograms drawn by xeno-canto.
            # Visual left margin is at 62px, but it seems the audio starts
            # 4px later.
            margin_left = 66
            px_per_ms = 75 / 1000
            left_px = margin_left + px_per_ms * start_ms
            right_px = margin_left + px_per_ms * end_ms
            draw.rectangle(((left_px, 0), (right_px, sonogram.height)), fill=color)

        highlight(start_ms, end_ms, (128, 128, 255, 32))
        for (s, e) in utterances:
            highlight(s, e, (128, 255, 128, 64))
        with tempfile.NamedTemporaryFile() as f:
            sonogram.save(f, format='png')
            subprocess.run(['eog', f.name], check=False)

    if skip_write:
        return None

    tmp_file_name = output_file_name + '.tmp'
    sound.export(tmp_file_name, format='ogg',
                 parameters=['-q:a', str(_AUDIO_QUALITY)])
    os.rename(tmp_file_name, output_file_name)
    return output_file_name
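# Note on the candidate selection in trim_recording above: `max(candidates)`
# compares the score vectors lexicographically, so the three criteria act as
# strict priorities. The numbers below are made up purely to illustrate that
# ordering; this helper is not part of the original module.
def _score_vector_ordering_example():
    too_short = ((-500.0, 0.0, 0.9), (0, 1500))     # misses min_duration by 500 ms
    acceptable = ((0.0, -200.0, 0.4), (0, 12200))   # long enough, slightly over max
    # A candidate that meets the minimum length beats one that does not,
    # regardless of its utterance/silence ratio; the ratio only breaks ties
    # among candidates with equal length scores.
    assert max([too_short, acceptable]) == acceptable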
#!/usr/bin/env python3
"""Download all the images referenced from an images.ndjson file.

Usage:

    ./fetch_images.py images.ndjson
"""

import fileinput
import json
import os

import requests

import fetcher

if __name__ == '__main__':
    f = fetcher.Fetcher()
    os.makedirs('images', exist_ok=True)
    for i, line in enumerate(fileinput.input()):
        image = json.loads(line)
        url = image.get('imageLink')
        if not url:
            continue
        path = os.path.join('images', os.path.basename(url))
        if os.path.exists(path):
            continue
        try:
            content = f.fetch_url(url)
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 404:
                continue  # sadly, some images are just missing