def run_ica():
    log('loading data')
    start = util.now()
    voxels, xdim, ydim, zdim = load_data()
    log(' elapsed: {}'.format(util.elapsed(start)))

    log('running independent component analysis')
    start = util.now()
    ica = decomposition.FastICA(n_components=64, max_iter=200)
    sources = ica.fit_transform(voxels)
    sources = to_dataframe(sources, load_subject_ids(), ['X{}'.format(i) for i in range(64)])
    log(' elapsed: {}'.format(util.elapsed(start)))

    log('calculating correlations between voxel and component time courses')
    start = util.now()
    correlations = []
    for voxel in voxels.columns[:32]:
        voxel = voxels[voxel]
        max_correlation = 0
        for source in sources.columns:
            source = sources[source]
            # np.corrcoef returns a 2x2 matrix; take the off-diagonal entry
            correlation = np.corrcoef(voxel, source)[0, 1]
            if correlation > max_correlation:
                max_correlation = correlation
        correlations.append(max_correlation)
    log(' elapsed: {}'.format(util.elapsed(start)))
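# A note on the timing helpers used throughout these snippets (a hedged sketch, not the
# authors' actual implementations): the web-app functions below call elapsed(start) or
# elapsed(start, n), where start is a time.time() timestamp and n is an optional number
# of decimal places, while the NEURON model functions call util.elapsed('message') to log
# a message along with wall-clock progress. A minimal elapsed() consistent with the
# elapsed(start, n) call sites could look like this:
from time import time

def elapsed(since, round_places=2):
    # seconds elapsed since the `since` timestamp, rounded for log output
    return round(time() - since, round_places)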
def init_gap_junctions():
    # initialize source
    for mgid in range(params.Nmitral):
        mpriden = split.mpriden(mgid)
        if mpriden:
            mpriden.push()
            pc.source_var(mpriden(0.99)._ref_v, mgid)
            h.pop_section()
    pc.barrier()
    # initialize targets
    for mgid in range(params.Nmitral):
        mpriden = split.mpriden(mgid)
        if mpriden:
            glomid = mgid / nmxg
            for sistermgid in range(glomid * nmxg, mgid) + range(mgid + 1, (glomid + 1) * nmxg):
                if pc.gid_exists(sistermgid) > 0:
                    gap = h.Gap(mpriden(0.99))
                    if sistermgid != 189:
                        getmodel().gj[(mgid, sistermgid)] = gap
                    glomid = mgid / nmxg
                    pc.target_var(gap, gap._ref_vgap, sistermgid)
    util.elapsed('Gap junctions built')
def build_granules(model):
    '''build granules'''
    model.granules = {}
    for gid in model.granule_gids:
        g = mkgranule(gid)
        model.granules[gid] = g
    elapsed('%d granules built' % int(pc.allreduce(len(model.granules), 1)))
def do_a_loop(first=None, last=None, url=None, threads=0, chunk_size=None):
    es = set_up_elastic(url)
    loop_start = time()
    results = es.search(index=INDEX_NAME, body=query, request_timeout=10000)
    # print u"search body:\n{}".format(query)
    print u"took {}s to search ES. remaining: {:,}".format(
        elapsed(loop_start, 2), results["hits"]["total"])

    records_to_save = []

    # decide if should stop looping after this
    if not results['hits']['hits']:
        sys.exit()

    crossref_results = []
    for crossref_hit in results['hits']['hits']:
        crossref_hit_doc = crossref_hit["_source"]
        crossref_results.append(CrossrefResult(crossref_hit["_id"], crossref_hit_doc))

    for crossref_result in crossref_results:
        records_to_save.append(crossref_result.make_action_record())

    # print "records_to_save", records_to_save
    print "starting saving"
    save_records_in_es(es, records_to_save, threads, chunk_size)
    print "** {}s to do {}\n".format(elapsed(loop_start, 2), len(crossref_results))
def ingest(self):
    debug_print("Ingesting directory {}".format(self.directory))
    debug_print("Ingesting the files \n{}".format(self.files))
    is_lambda = self.context[c.KEY_LAMBDA_FUNCTION] is not None
    timeout = self.__calculate_aggregate_window_timeout(self.context[c.KEY_MAX_LAMBDA_TIME])
    target_excretion_size = self.context[c.KEY_TARGET_AGGREGATION_FILE_SIZE_IN_MB]
    compression_ratio = self.context[c.KEY_CSV_PARQUET_COMPRESSION_RATIO]
    sep = self.context[c.KEY_SEPERATOR_PARTITION]
    memory_trigger = self.context[c.KEY_AMOEBA_MEMORY_FLUSH_TRIGGER]
    memory_used = mutil.get_memory_usage()
    main_filename, main_file_data, main_file_size_mb = self.__get_main_aggregate_file(self.directory, sep, target_excretion_size)
    main_file_data = self.__append(None, main_file_data)
    keys_ingested = []
    for file in self.files:
        debug_print("\tProcessing file {}".format(file))
        key_parts = KeyParts(file, sep)
        duration = datetime.datetime.utcnow() - key_parts.filename_timestamp
        if duration.total_seconds() < 300:
            debug_print("The file '{}' is {}s old. It is too new and will be processed later to allow for S3 propagation.".format(file, duration.total_seconds()))
            continue
        keys_ingested.append(file)
        data = self.__open(file, main_file_data)
        if data is None:
            continue
        size_in_megabytes = self.__size(file)
        main_file_data = self.__append(main_file_data, data)
        del data
        gc.collect()
        current_dataframe_size = sys.getsizeof(main_file_data)
        # break conditions
        # 1. Memory limit exceeded
        # 2. Time window exceeded
        # 3. Target excretion size hit
        main_file_size_mb += size_in_megabytes
        memory_used = mutil.get_memory_usage()
        debug_print("\t\tSize on S3: {}MB Size of new dataset: {}bytes Estimated Compression Ratio: {} Memory Used: {}% Project Compression Size {}MB Target Excretion Size {}MB".format(size_in_megabytes, current_dataframe_size, compression_ratio, memory_used, main_file_size_mb, target_excretion_size))
        if util.elapsed(self.context) > timeout or memory_used > memory_trigger or main_file_size_mb > target_excretion_size:
            print "Elapsed", util.elapsed(self.context), "Start:", self.starttime, "Timeout:", timeout, "Has timed out:", util.elapsed(self.context) > timeout, "Mem Used %:", memory_used, "Max Memory %:", memory_trigger
            break
    # only save the files if we have a reasonable amount of time remaining before the lambda timeout.
    debug_print("Time remaining: {}s".format(util.time_remaining(self.context)))
    debug_print("There were {} keys ingested. The keys ingested are: \n {}".format(len(keys_ingested), keys_ingested))
    if len(keys_ingested) > 0 and util.time_remaining(self.context) > c.SAVE_WINDOW_IN_SECONDS and not main_file_data.empty:
        main_file_data = self.__convert_to_submission_df(main_file_data)
        gc.collect()
        self.__excret(self.directory, main_filename, main_file_data, sep)
        self.__delete_keys(keys_ingested)
    elif util.time_remaining(self.context) <= c.SAVE_WINDOW_IN_SECONDS:
        print "Time has run out! We have less than {} seconds remaining before this lambda times out. Abandoning the S3 commit to avoid file corruption.".format(c.SAVE_WINDOW_IN_SECONDS)
        print "Aggregation window (Max Lambda Execution Time * {}): {} seconds".format(c.RATIO_OF_MAX_LAMBDA_TIME, timeout)
        print "S3 Save window: {} seconds".format(c.SAVE_WINDOW_IN_SECONDS)
        print "Lambda time remaining: {} seconds".format(util.time_remaining(self.context))
        remaining_files = list(set(self.files) - set(keys_ingested))
        if len(remaining_files) > 0:
            debug_print("Re-adding the {} paths to SQS to attempt again. The paths are \n{}".format(len(remaining_files), remaining_files))
            self.__add_to_sqs(remaining_files)
    print "I've consumed everything I can in bucket '{}'".format(self.directory)
    return
def print_status(self):
    sleep(1)  # at top to make sure there's time for the jobs to be saved in redis.

    num_jobs_remaining = ti_queues[self.queue_number].count
    num_jobs_done = self.num_jobs_total - num_jobs_remaining

    print "finished {done} jobs in {elapsed} min. {left} left.".format(
        done=num_jobs_done,
        elapsed=round(elapsed(self.start_time) / 60, 1),
        left=num_jobs_remaining)
    self.number_of_prints += 1

    if self.number_of_prints % self.seconds_between_chunks == self.seconds_between_chunks - 1:
        num_jobs_finished_this_chunk = num_jobs_done - self.last_chunk_num_jobs_completed
        if not num_jobs_finished_this_chunk:
            print "No jobs finished this chunk... :/"
        else:
            chunk_elapsed = elapsed(self.last_chunk_start_time)
            jobs_per_hour_this_chunk = num_jobs_finished_this_chunk / float(chunk_elapsed / 3600)
            predicted_mins_to_finish = round(
                (num_jobs_remaining / float(jobs_per_hour_this_chunk)) * 60, 1)
            print "We're doing {} jobs per hour. At this rate, done in {}min\n".format(
                int(jobs_per_hour_this_chunk), predicted_mins_to_finish)
        self.last_chunk_start_time = time()
        self.last_chunk_num_jobs_completed = num_jobs_done
    return num_jobs_remaining
def mk_mitrals(model):
    '''Create all the mitrals specified by mitral_gids set.'''
    model.mitrals = {}
    for gid in model.mitral_gids:
        m = mkmitral.mkmitral(gid)
        model.mitrals.update({gid: m})
    util.elapsed('%d mitrals created and connections to mitrals determined' % int(pc.allreduce(len(model.mitrals), 1)))
def build_net_round_robin(model, connection_file):
    import custom_params
    model.mitral_gids = set(range(0, min(635, custom_params.customMitralCount)))
    model.granule_gids = set(range(
        max(model.mitral_gids) + 1,
        min(122166, custom_params.customMitralCount * custom_params.customGranulesPerMitralCount)))
    model.gids = model.mitral_gids.union(model.granule_gids)
    enter = h.startsw()
    dc.mk_mitrals(model)
    #return # removing as per M. Migliore's email
    read_mconnection_info(model, connection_file)
    dc.mk_gconnection_info(model)
    model.gids = model.mitral_gids.copy()
    model.gids.update(model.granule_gids)
    register_mitrals(model)
    build_granules(model)
    register_granules(model)
    build_synapses(model)
    elapsed('build_net_round_robin')
    if rank == 0:
        print "round robin setuptime ", h.startsw() - t_begin
def gets_a_pdf(link, base_url):
    if is_purchase_link(link):
        return False

    absolute_url = get_link_target(link, base_url)
    start = time()
    with closing(requests.get(absolute_url, stream=True, timeout=5, verify=False)) as r:
        if resp_is_pdf(r):
            print u"http header says this is a PDF. took {}s from {}".format(elapsed(start), absolute_url)
            return True

        # some publishers send a pdf back wrapped in an HTML page using frames.
        # this is where we detect that, using each publisher's idiosyncratic templates.
        # we only check based on a whitelist of publishers, because downloading this whole
        # page (r.content) is expensive to do for everyone.
        if 'onlinelibrary.wiley.com' in absolute_url:
            # = closed journal http://doi.org/10.1111/ele.12585
            # = open journal http://doi.org/10.1111/ele.12587
            if '<iframe' in r.content:
                print u"this is a Wiley 'enhanced PDF' page. took {}s".format(elapsed(start))
                return True

        elif 'ieeexplore' in absolute_url:
            # (this is a good example of one dissem.in misses)
            # = open journal http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6740844
            # = closed journal http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6045214
            if '<frame' in r.content:
                print u"this is a IEEE 'enhanced PDF' page. took {}s".format(elapsed(start))
                return True

        print u"we've decided this ain't a PDF. took {}s".format(elapsed(start))
        return False
def build_granules(model):
    '''build granules'''
    model.granules = {}
    for gid in model.granule_gids:
        g = mkgranule(gid)
        model.granules.update({gid: g})
    elapsed('%d granules built' % int(pc.allreduce(len(model.granules), 1)))
def load_campaign(filename, campaign=None, limit=None):
    with open("data/" + filename, "r") as f:
        lines = f.read().split("\n")

    print "found {} ORCID lines".format(len(lines))
    print len(lines)
    if limit:
        lines = lines[:limit]

    total_start = time()
    row_num = 0
    for line in lines:
        row_num += 1

        # can have # as comments
        if line.startswith("#"):
            print "skipping comment line"
            continue

        loop_start = time()
        email = None
        twitter = None  # default, so rows without comma-separated fields don't raise NameError below
        if "," in line:
            (dirty_orcid, email, twitter) = line.split(",")
        else:
            dirty_orcid = line

        try:
            orcid_id = clean_orcid(dirty_orcid)
        except NoOrcidException:
            try:
                print u"\n\nWARNING: no valid orcid_id in line {}; skipping\n\n".format(line)
            except UnicodeDecodeError:
                print u"\n\nWARNING: no valid orcid_id and line throws UnicodeDecodeError; skipping\n\n"
            continue

        my_person = Person.query.filter_by(orcid_id=orcid_id).first()
        if my_person:
            print u"row {}, already have person {}, skipping".format(row_num, orcid_id)
        else:
            print u"row {}, making person {}".format(row_num, orcid_id)
            my_person = make_person(orcid_id, high_priority=False)
            my_person.campaign = campaign
            my_person.email = email
            my_person.twitter = twitter
            db.session.merge(my_person)
            commit_success = safe_commit(db)
            if not commit_success:
                print u"COMMIT fail on {}".format(my_person.orcid_id)

        print "row {}: finished {} in {}s\n".format(row_num, orcid_id, elapsed(loop_start))

    print "finished load_campaign on {} profiles in {}s\n".format(len(lines), elapsed(total_start))
def register_blanes(model):
    '''register blanes'''
    for gid in model.blanes:
        if h.section_exists("soma", model.blanes[gid]):
            s = model.blanes[gid].soma
            pc.set_gid2node(gid, rank)
            pc.cell(gid, h.NetCon(s(1)._ref_v, None, sec=s))
    elapsed('blanes registered')
def output():
    wtime = h.startsw()
    for vmrec, filename in vmrecordings:
        f = open(filename, 'w')
        for j in range(int(vmrec.size())):
            f.write('%g %g\n' % (tvec.x[j], vmrec.x[j]))
        f.close()
    util.elapsed('vm recorded write time %.g' % (h.startsw() - wtime))
def get_package_specific_scenario_data_from_db(package_id):
    timing = []
    section_time = time()

    counter_dict = get_counter_totals_from_db(package_id)

    timing.append(("time from db: counter", elapsed(section_time, 2)))
    section_time = time()

    command = """select citing.issn_l, citing.year::int, sum(num_citations) as num_citations
        from jump_citing citing
        join jump_grid_id institution_grid on citing.grid_id = institution_grid.grid_id
        join jump_account_package institution_package on institution_grid.institution_id = institution_package.institution_id
        where citing.year < 2019
        and institution_package.package_id='{package_id}'
        and (citing.issn_l in (select distinct issn_l from jump_counter where package_id='{package_id}'))
        group by citing.issn_l, year""".format(package_id=package_id)
    citation_rows = None
    with get_db_cursor() as cursor:
        cursor.execute(command)
        citation_rows = cursor.fetchall()
    citation_dict = defaultdict(dict)
    for row in citation_rows:
        citation_dict[row["issn_l"]][row["year"]] = round(row["num_citations"])

    timing.append(("time from db: citation_rows", elapsed(section_time, 2)))
    section_time = time()

    command = """select authorship.issn_l, authorship.year::int, sum(num_authorships) as num_authorships
        from jump_authorship authorship
        join jump_grid_id institution_grid on authorship.grid_id = institution_grid.grid_id
        join jump_account_package institution_package on institution_grid.institution_id = institution_package.institution_id
        where authorship.year < 2019
        and institution_package.package_id='{package_id}'
        and (authorship.issn_l in (select distinct issn_l from jump_counter where package_id='{package_id}'))
        group by authorship.issn_l, year""".format(package_id=package_id)
    authorship_rows = None
    with get_db_cursor() as cursor:
        cursor.execute(command)
        authorship_rows = cursor.fetchall()
    authorship_dict = defaultdict(dict)
    for row in authorship_rows:
        authorship_dict[row["issn_l"]][row["year"]] = round(row["num_authorships"])

    timing.append(("time from db: authorship_rows", elapsed(section_time, 2)))
    section_time = time()

    data = {
        "timing": timing,
        "counter_dict": counter_dict,
        "citation_dict": citation_dict,
        "authorship_dict": authorship_dict
    }
    return data
def mk_gconnection_info_part2(model):
    # transfer the gconnection info to the proper rank and make granule_gids set
    model.rank_gconnections = all2all(model.rank_gconnections)
    util.elapsed('rank_gconnections known')
    model.granule_gids = set([i[3] for r in model.rank_gconnections for i in model.rank_gconnections[r]])
    util.elapsed('granule gids known on each rank')
def load_campaign(filename, campaign=None, limit=None):
    with open("data/" + filename, "r") as f:
        lines = f.read().split("\n")

    print "found {} ORCID lines".format(len(lines))
    print len(lines)
    if limit:
        lines = lines[:limit]

    total_start = time()
    row_num = 0
    for line in lines:
        row_num += 1

        # can have # as comments
        if line.startswith("#"):
            print "skipping comment line"
            continue

        loop_start = time()
        email = None
        twitter = None  # default, so rows without comma-separated fields don't raise NameError below
        if "," in line:
            (dirty_orcid, email, twitter) = line.split(",")
        else:
            dirty_orcid = line

        try:
            orcid_id = clean_orcid(dirty_orcid)
        except NoOrcidException:
            try:
                print u"\n\nWARNING: no valid orcid_id in line {}; skipping\n\n".format(line)
            except UnicodeDecodeError:
                print u"\n\nWARNING: no valid orcid_id and line throws UnicodeDecodeError; skipping\n\n"
            continue

        my_person = Person.query.filter_by(orcid_id=orcid_id).first()
        if my_person:
            print u"row {}, already have person {}, skipping".format(row_num, orcid_id)
        else:
            print u"row {}, making person {}".format(row_num, orcid_id)
            my_person = make_person(orcid_id, store_in_db=True)
            my_person.campaign = campaign
            my_person.email = email
            my_person.twitter = twitter
            db.session.merge(my_person)
            commit_success = safe_commit(db)
            if not commit_success:
                print u"COMMIT fail on {}".format(my_person.orcid_id)

        print "row {}: finished {} in {}s\n".format(row_num, orcid_id, elapsed(loop_start))

    print "finished load_campaign on {} profiles in {}s\n".format(len(lines), elapsed(total_start))
def mk_mconnection_info(model):
    r = {}
    GL_to_GCs = {}
    to_conn = []
    cilist = []

    # initialization
    for gid in model.mitrals.keys():  #+model.mtufted.keys():
        r[gid] = params.ranstream(gid, params.stream_latdendconnect)  # init rng
        glomid = mgid2glom(gid)  #params.cellid2glomid(gid)
        # init GCs connected to GL
        if glomid not in GL_to_GCs:
            GL_to_GCs[glomid] = set()

    # lateral dendrites positions
    for cellid, cell in model.mitrals.items():  #+model.mtufted.values():
        to_conn += latconn.lateral_connections(cellid, cell)

    ntot_conn = pc.allreduce(len(to_conn), 1)  # all connections

    # connect to granule cells
    it = 0
    while pc.allreduce(len(to_conn), 2) > 0:
        connect2gc(to_conn, r, GL_to_GCs)
        # good connect vs to redo and update GL_to_GCs
        _cilist, to_conn1 = detect_intraglom_conn(to_conn, GL_to_GCs)
        #_cilist, to_conn2 = detect_over_connected_gc(_cilist)
        #to_conn = to_conn1 + to_conn2
        to_conn = to_conn1
        cilist += _cilist
        it += 1

    ntot_conn = pc.allreduce(len(cilist), 1) / ntot_conn

    # fill the model data
    MCconn = 0
    mTCconn = 0
    for ci in cilist:
        #if params.gid_is_mitral(ci[0]):
        conns = model.mconnections
        MCconn += 1
        #elif params.gid_is_mtufted(ci[0]):
        #    conns = model.mt_connections
        #    mTCconn += 1
        if ci[0] not in conns:
            conns[ci[0]] = []
        conns[ci[0]].append(ci)

    util.elapsed('Mitral %d and mTufted %d cells connection infos. generated (it=%d,err=%.3g%%)' % (int(pc.allreduce(MCconn, 1)),
                                                                                                    int(pc.allreduce(mTCconn, 1)),
                                                                                                    int(pc.allreduce(it, 2)),
                                                                                                    (1 - ntot_conn) * 100))
def register_mitrals(model):
    '''register mitrals'''
    for gid in model.mitrals:
        if h.section_exists("soma", model.mitrals[gid]):
            s = model.mitrals[gid].soma
            pc.set_gid2node(gid, rank)
            pc.cell(gid, h.NetCon(s(1)._ref_v, None, sec=s))
            if not mpiece_exists(gid):  # must not be doing multisplit
                wholemitral(gid, model.mitrals[gid])
    elapsed('mitrals registered')
def register_mitrals(model):
    '''register mitrals'''
    for gid in model.mitrals:
        if h.section_exists("initialseg", model.mitrals[gid]):
            s = model.mitrals[gid].initialseg
            pc.set_gid2node(gid, rank)
            pc.cell(gid, h.NetCon(s(1)._ref_v, None, sec=s))
            if not mpiece_exists(gid):  # must not be doing multisplit
                wholemitral(gid, model.mitrals[gid])
    elapsed('mitrals registered')
def update_fn(self, cls, method_name, objects, index=1):
    # we are in a fork! dispose of our engine.
    # will get a new one automatically
    # if is pooling, need to do .dispose() instead
    db.engine.dispose()

    start = time()
    num_obj_rows = len(objects)

    # logger.info(u"{pid} {repr}.{method_name}() got {num_obj_rows} objects in {elapsed} seconds".format(
    #     pid=os.getpid(),
    #     repr=cls.__name__,
    #     method_name=method_name,
    #     num_obj_rows=num_obj_rows,
    #     elapsed=elapsed(start)
    # ))

    for count, obj in enumerate(objects):
        start_time = time()

        if obj is None:
            return None

        method_to_run = getattr(obj, method_name)

        # logger.info(u"***")
        logger.info(u"*** #{count} starting {repr}.{method_name}() method".format(
            count=count + (num_obj_rows * index),
            repr=obj,
            method_name=method_name
        ))

        method_to_run()

        logger.info(u"finished {repr}.{method_name}(). took {elapsed} seconds".format(
            repr=obj,
            method_name=method_name,
            elapsed=elapsed(start_time, 4)
        ))

        # for handling the queue
        if not (method_name == "update" and obj.__class__.__name__ == "Pub"):
            obj.finished = datetime.datetime.utcnow().isoformat()
        # db.session.merge(obj)

    start_time = time()
    commit_success = safe_commit(db)
    if not commit_success:
        logger.info(u"COMMIT fail")
    logger.info(u"commit took {} seconds".format(elapsed(start_time, 2)))
    db.session.remove()  # close connection nicely
    return None  # important for if we use this on RQ
def recompute_journal_metadata():
    journals_raw = JournalsDBRaw.query.all()
    print len(journals_raw)

    new_computed_journals = []

    print "making backups and getting tables ready to run"
    with get_db_cursor() as cursor:
        cursor.execute("drop table journalsdb_raw_bak_yesterday;")
        cursor.execute("drop table journalsdb_computed_bak_yesterday;")
        cursor.execute("create table journalsdb_raw_bak_yesterday as (select * from journalsdb_raw);")
        cursor.execute("create table journalsdb_computed_bak_yesterday as (select * from journalsdb_computed);")

    # do it as its own to force commit
    with get_db_cursor() as cursor:
        # don't truncate raw! is populated by xplenty.
        # further more truncate hangs, so do truncation this way instead
        cursor.execute("delete from journalsdb_computed;")
    print "tables ready for insertion"

    for journal_raw in journals_raw:
        new_journal_metadata = JournalMetadata(journal_raw)
        new_computed_journals.append(new_journal_metadata)

    print "starting commits"
    start_time = time()
    insert_values_list = [j.get_insert_values() for j in new_computed_journals]
    command_start = u"""INSERT INTO journalsdb_computed ({}) VALUES """.format(
        ",".join(JournalMetadata.get_insert_column_names()))

    with get_db_cursor() as cursor:
        i = 0
        for short_values_list in chunks(insert_values_list, 1000):
            values_list_string = u",".join(short_values_list)
            q = u"{} {};".format(command_start, values_list_string)
            cursor.execute(q)
            i += 1
            print i

    print u"done committing journals, took {} seconds total".format(elapsed(start_time))

    print u"now refreshing flat view"
    with get_db_cursor() as cursor:
        cursor.execute("refresh materialized view journalsdb_computed_flat;")
        cursor.execute("analyze journalsdb_computed;")

    print u"done writing to db, took {} seconds total".format(elapsed(start_time))
def build_net_round_robin(model, connection_file):
    enter = h.startsw()
    dc.mk_mitrals(model)
    read_mconnection_info(model, connection_file)
    dc.mk_gconnection_info(model)
    model.gids = model.mitral_gids.copy()
    model.gids.update(model.granule_gids)
    register_mitrals(model)
    build_granules(model)
    register_granules(model)
    build_synapses(model)
    elapsed('build_net_round_robin')
    if rank == 0:
        print "round robin setuptime ", h.startsw() - t_begin
def mk_b2g_connections():
    # gid_blanes_existing = set([x[1] for x in params.glom2blanes])
    getmodel().blanes2gc_connections.clear()
    elapsed('\t%d granules are generated' % pc.allreduce(len(getmodel().granules), 1))
    for ggid, blanes_gid, factor in load_blanes_dic('blanes6.dic'):
        getmodel().blanes2gc_connections.add((ggid, blanes_gid, factor))
    # for ggid in getmodel().granules:
    #     for blanes_gid in gid_blanes_existing:
    #         getmodel().blanes2gc_connections.add((ggid, blanes_gid))
    elapsed('%d blanes to granule connections generated' % pc.allreduce(len(getmodel().blanes2gc_connections), 1))
def scroll_through_all_dois(query_doi=None, first=None, last=None, today=False, week=False, chunk_size=1000):
    # needs a mailto, see https://github.com/CrossRef/rest-api-doc#good-manners--more-reliable-service
    headers = {"Accept": "application/json", "User-Agent": "mailto:[email protected]"}

    if first:
        base_url = "https://api.crossref.org/works?filter=from-created-date:{first},until-created-date:{last}&rows={rows}&select=DOI&cursor={next_cursor}"
    else:
        base_url = "https://api.crossref.org/works?filter=until-created-date:{last}&rows={rows}&select=DOI&cursor={next_cursor}"

    next_cursor = "*"
    has_more_responses = True
    number_added = 0

    while has_more_responses:
        has_more_responses = False
        start_time = time()
        url = base_url.format(first=first, last=last, rows=chunk_size, next_cursor=next_cursor)
        logger.info(u"calling url: {}".format(url))

        resp = requests.get(url, headers=headers)
        logger.info(u"getting crossref response took {} seconds. url: {}".format(elapsed(start_time, 2), url))
        if resp.status_code != 200:
            logger.info(u"error in crossref call, status_code = {}".format(resp.status_code))
            return number_added

        resp_data = resp.json()["message"]
        next_cursor = resp_data.get("next-cursor", None)
        if next_cursor:
            next_cursor = quote(next_cursor)
            if resp_data["items"] and len(resp_data["items"]) == chunk_size:
                has_more_responses = True

        dois_from_api = [clean_doi(api_raw["DOI"]) for api_raw in resp_data["items"]]
        added_pubs = add_new_pubs_from_dois(dois_from_api)
        if dois_from_api:
            logger.info(u"got {} dois from api".format(len(dois_from_api)))
        if added_pubs:
            logger.info(u"{}: saved {} new pubs, including {}".format(
                first, len(added_pubs), added_pubs[-2:]))

        number_added += len(added_pubs)

        logger.info(u"loop done in {} seconds".format(elapsed(start_time, 2)))

    return number_added
def do_a_loop(first=None, last=None, url=None, threads=0, chunk_size=None):
    just_random = True

    loop_start = time()
    es = set_up_elastic(url)
    if just_random:
        random_query_dict["from"] = int(random.random() * 7999)
        results = es.search(index=INDEX_NAME, body=random_query_dict, request_timeout=10000)
    else:
        # different every loop
        query_dict["from"] = int(random.random() * 7999)
        results = es.search(index=INDEX_NAME, body=query_dict, request_timeout=10000)
    # print u"search body:\n{}".format(query)
    print u"took {}s to search ES".format(elapsed(loop_start, 2))

    records_to_save = []

    # decide if should stop looping after this
    if not results["hits"]["hits"]:
        print "no hits! exiting"
        sys.exit()

    base_results = []
    for base_hit in results["hits"]["hits"]:
        base_hit_doc = base_hit["_source"]
        base_results.append(BaseResult(base_hit_doc))

    scrape_start = time()
    # don't do scrape right now
    # targets = [base_result.scrape_for_fulltext for base_result in base_results]
    # call_targets_in_parallel(targets)
    # print u"scraping {} webpages took {}s".format(len(base_results), elapsed(scrape_start, 2))

    targets = [base_result.set_base1s for base_result in base_results]
    call_targets_in_parallel(targets)

    for base_result in base_results:
        base_result.set_fulltext_urls()
        records_to_save.append(base_result.make_action_record())

    # print "len of records_to_save", len(records_to_save)
    # print "records_to_save:", records_to_save
    save_records_in_es(es, records_to_save, threads, chunk_size)

    print "** took {}s to do {}, {:,} remaining\n".format(
        elapsed(loop_start, 2), len(base_results), results["hits"]["total"])
def mk_gconnection_info(model):
    mk_gconnection_info_part1(model)
    mk_gconnection_info_part2(model)

    # # Save full network Mitral-Granule connections
    # mitral2granule = {}
    # for mgid in model.mitral_gids:
    #     mitral2granule.update({mgid: [gc[3] for gc in model.mconnections[mgid]]})
    #
    # import cPickle as pickle
    # with open('mitral2granule.p', 'wb') as fp:
    #     pickle.dump(mitral2granule, fp)

    util.elapsed('mk_gconnection_info (#granules = %d)' % int(pc.allreduce(len(model.granule_gids), 1)))
def update_fn(cls, method_name, obj_id_list, shortcut_data=None, index=1):
    # we are in a fork! dispose of our engine.
    # will get a new one automatically
    db.engine.dispose()

    start = time()

    q = db.session.query(cls).options(orm.undefer('*')).filter(cls.id.in_(obj_id_list))
    obj_rows = q.all()
    num_obj_rows = len(obj_rows)
    print "{repr}.{method_name}() got {num_obj_rows} objects in {elapsed}sec".format(
        repr=cls.__name__,
        method_name=method_name,
        num_obj_rows=num_obj_rows,
        elapsed=elapsed(start)
    )

    for count, obj in enumerate(obj_rows):
        start_time = time()

        if obj is None:
            return None

        method_to_run = getattr(obj, method_name)

        print u"\n***\n{count}: starting {repr}.{method_name}() method".format(
            count=count + (num_obj_rows * index),
            repr=obj,
            method_name=method_name
        )

        if shortcut_data:
            method_to_run(shortcut_data)
        else:
            method_to_run()

        print u"finished {repr}.{method_name}(). took {elapsed}sec".format(
            repr=obj,
            method_name=method_name,
            elapsed=elapsed(start_time, 4)
        )

        commit_success = safe_commit(db)
        if not commit_success:
            print u"COMMIT fail"

    db.session.remove()  # close connection nicely
    return None  # important for if we use this on RQ
def update_fn(cls, method_name, obj_id_list, shortcut_data=None):
    # we are in a fork! dispose of our engine.
    # will get a new one automatically
    db.engine.dispose()

    start = time()

    q = db.session.query(cls).filter(cls.id.in_(obj_id_list))
    if cls.__name__ == "Person":
        q = q.options(person_load_options())
    obj_rows = q.all()
    num_obj_rows = len(obj_rows)
    print "{repr}.{method_name}() got {num_obj_rows} objects in {elapsed}sec".format(
        repr=cls.__name__,
        method_name=method_name,
        num_obj_rows=num_obj_rows,
        elapsed=elapsed(start)
    )

    for obj in obj_rows:
        start_time = time()

        if obj is None:
            return None

        method_to_run = getattr(obj, method_name)

        print u"\nstarting {repr}.{method_name}() method".format(
            repr=obj,
            method_name=method_name
        )

        if shortcut_data:
            method_to_run(shortcut_data)
        else:
            method_to_run()

        print u"finished {repr}.{method_name}(). took {elapsed}sec".format(
            repr=obj,
            method_name=method_name,
            elapsed=elapsed(start_time, 4)
        )

        db.session.commit()

    db.session.remove()  # close connection nicely
    return None  # important for if we use this on RQ
def harvest(self, **kwargs):  # pragma: no cover
    """Make HTTP requests to the OAI server.

    :param kwargs: OAI HTTP parameters.
    :rtype: :class:`sickle.OAIResponse`
    """
    start_time = time()
    for _ in range(self.max_retries):
        if self.http_method == 'GET':
            payload_str = "&".join("%s=%s" % (k, v) for k, v in kwargs.items())
            url_without_encoding = u"{}?{}".format(self.endpoint, payload_str)
            http_response = requests.get(url_without_encoding, **self.request_args)
            self.http_response_url = http_response.url
        else:
            http_response = requests.post(self.endpoint, data=kwargs, **self.request_args)
            self.http_response_url = http_response.url

        if http_response.status_code == 503:
            retry_after = self.RETRY_SECONDS
            logger.info("HTTP 503! Retrying after %d seconds..." % retry_after)
            sleep(retry_after)
        else:
            logger.info("took {} seconds to call pmh url: {}".format(elapsed(start_time), http_response.url))

            http_response.raise_for_status()
            if self.encoding:
                http_response.encoding = self.encoding
            return OAIResponse(http_response, params=kwargs)
def get_search_query(query):
    start_time = time()
    my_pubs = fulltext_search_title(query)
    response = [my_pub.to_dict_search() for my_pub in my_pubs]
    sorted_response = sorted(response, key=lambda k: k['score'], reverse=True)
    elapsed_time = elapsed(start_time, 3)
    return jsonify({"results": sorted_response, "elapsed_seconds": elapsed_time})
def leaderboard():
    filters_dict = make_filters_dict(request.args)
    page_size = request.args.get("page_size", "25")

    start = time()
    num_total, leaders = get_leaders(filters=filters_dict, page_size=int(page_size))
    leaders_list = [leader.as_snippet for leader in leaders]

    ret_dict = {
        "num_returned": len(leaders_list),
        "num_total": num_total,
        "list": leaders_list,
        "type": filters_dict["type"],
        "filters": filters_dict
    }
    if "tag" in filters_dict:
        tag_obj = Tags.query.filter(Tags.unique_tag == filters_dict["tag"]).first()
        ret_dict["related_tags"] = tag_obj.related_tags

    ret = json_resp_from_thing(ret_dict)
    elapsed_time = elapsed(start)
    ret.headers["x-elapsed"] = elapsed_time
    return ret
def check_pdf_urls(pdf_urls):
    for url in pdf_urls:
        make_transient(url)

    # free up the connection while doing net IO
    safe_commit(db)
    db.engine.dispose()

    req_pool = get_request_pool()
    checked_pdf_urls = req_pool.map(get_pdf_url_status, pdf_urls, chunksize=1)
    req_pool.close()
    req_pool.join()

    row_dicts = [x.__dict__ for x in checked_pdf_urls]
    for row_dict in row_dicts:
        row_dict.pop('_sa_instance_state')

    db.session.bulk_update_mappings(PdfUrl, row_dicts)

    start_time = time()
    commit_success = safe_commit(db)
    if not commit_success:
        logger.info(u"COMMIT fail")
    logger.info(u"commit took {} seconds".format(elapsed(start_time, 2)))
def get_search_query():
    query = request.args.get("query", None)
    is_oa = request.args.get("is_oa", None)

    if is_oa is not None:
        try:
            is_oa = str_to_bool(is_oa)
        except ValueError:
            if is_oa == 'null':
                is_oa = None
            else:
                abort_json(400, "is_oa must be 'true' or 'false'")

    if not query:
        abort_json(400, "query parameter is required")

    start_time = time()
    response = fulltext_search_title(query, is_oa)
    sorted_response = sorted(response, key=lambda k: k['score'], reverse=True)

    for api_response in sorted_response:
        doi = api_response['response']['doi']
        version_suffix = re.findall(ur'[./](v\d+)$', doi, re.IGNORECASE)

        if version_suffix:
            title = api_response['response']['title']
            title = u'{} ({})'.format(title, version_suffix[0].upper())
            api_response['response']['title'] = title

    elapsed_time = elapsed(start_time, 3)
    return jsonify({"results": sorted_response, "elapsed_seconds": elapsed_time})
def add_dois_to_queue_from_query(where, job_type):
    logger.info(u"adding all dois, this may take a while")
    start = time()

    table_name = "doi_queue"

    # run_sql(db, "drop table {} cascade".format(table_name(job_type)))
    # create_table_command = "CREATE TABLE {} as (select id, random() as rand, null::timestamp as finished, null::timestamp as started, null::text as dyno from crossref)".format(
    #     table_name(job_type))
    create_table_command = "CREATE TABLE {} as (select id, random() as rand, null::timestamp as finished, null::timestamp as started from pub);".format(
        table_name)

    if where:
        create_table_command = create_table_command.replace("from pub)", "from pub where {})".format(where))
    run_sql(db, create_table_command)

    create_table_command += """
        alter table {table_name} alter column rand set default random();
        CREATE INDEX {table_name}_id_idx ON {table_name} USING btree (id);
        CREATE INDEX {table_name}_finished_null_rand_idx on {table_name} (rand) where finished is null;
        CREATE INDEX {table_name}_started_null_rand_idx ON {table_name} USING btree (rand, started) WHERE started is null;

        -- from https://lob.com/blog/supercharge-your-postgresql-performance
        -- vacuums and analyzes every ten million rows
        ALTER TABLE {table_name} SET (autovacuum_vacuum_scale_factor = 0.0);
        ALTER TABLE {table_name} SET (autovacuum_vacuum_threshold = 10000000);
        ALTER TABLE {table_name} SET (autovacuum_analyze_scale_factor = 0.0);
        ALTER TABLE {table_name} SET (autovacuum_analyze_threshold = 10000000);
        """.format(table_name=table_name)
    for command in create_table_command.split(";"):
        run_sql(db, command)

    command = """create or replace view export_queue as
        SELECT id AS doi,
            updated AS updated,
            response_jsonb->>'evidence' AS evidence,
            response_jsonb->>'oa_status' AS oa_color,
            response_jsonb->>'free_fulltext_url' AS best_open_url,
            response_jsonb->>'year' AS year,
            response_jsonb->>'found_hybrid' AS found_hybrid,
            response_jsonb->>'found_green' AS found_green,
            response_jsonb->>'error' AS error,
            response_jsonb->>'is_boai_license' AS is_boai_license,
            replace(api->'_source'->>'journal', ' ', '') AS journal,
            replace(api->'_source'->>'publisher', ' ', '') AS publisher,
            api->'_source'->>'title' AS title,
            api->'_source'->>'subject' AS subject,
            response_jsonb->>'license' AS license
        FROM pub where id in (select id from {table_name})""".format(
        table_name=table_name)  # table_name is a plain string here, not a table_name() helper

    # if job_type:
    #     command_with_hybrid = command.replace("response_jsonb", "response_with_hybrid").replace("export_queue", "export_queue_with_hybrid")
    run_sql(db, command)

    # they are already lowercased
    logger.info(u"add_dois_to_queue_from_query done in {} seconds".format(elapsed(start, 1)))
    print_status(job_type)
def _grep_for_dep_lines(self, query_str, include_globs, exclude_globs):
    arg_list = ['zipgrep', query_str, self.temp_file_name]
    arg_list += include_globs
    arg_list.append("-x")
    arg_list += exclude_globs

    start = time()
    try:
        print "Running zipgrep: '{}'".format(" ".join(arg_list))
        self.dep_lines = subprocess32.check_output(arg_list, timeout=90)

    except subprocess32.CalledProcessError:
        # heroku throws an error here when there are no dep lines to find.
        # but it's fine. there just aren't no lines.
        pass

    except subprocess32.TimeoutExpired:
        # too many files, we'll skip it and move on.
        self.error = "grep_timeout"
        pass

    finally:
        self.grep_elapsed = elapsed(start, 4)
        #print "found these dep lines: {}".format(self.dep_lines)
        print "finished dep lines search in {} sec".format(self.grep_elapsed)
def add_repos_from_remote_csv(csv_url, language):
    start = time()

    print "going to go get file"
    response = requests.get(csv_url, stream=True)

    index = 0
    for github_url in response.iter_lines(chunk_size=1000):
        login, repo_name = login_and_repo_name_from_url(github_url)
        if login and repo_name:
            repo = GithubRepo(
                login=login,
                repo_name=repo_name,
                language=language
            )
            print repo
            db.session.merge(repo)

        index += 1
        if index % 1000 == 0:
            db.session.commit()
            print "flushing on index {index}, elapsed: {elapsed}".format(
                index=index,
                elapsed=elapsed(start))

    db.session.commit()
def log_request(resp):
    if request.endpoint != "get_doi_endpoint":
        return

    logging_start_time = time()

    try:
        results = json.loads(resp.get_data())["results"][0]
    except (ValueError, RuntimeError, KeyError):
        # don't bother logging if no results
        return

    oa_color = results["oa_color"]
    if not oa_color:
        oa_color = "gray"

    body = {
        "timestamp": datetime.utcnow().isoformat(),
        "elapsed": elapsed(g.request_start_time, 2),
        "ip": get_ip(),
        "status_code": resp.status_code,
        "email": request.args.get("email", None),
        "doi": results["doi"],
        "year": results.get("year", None),
        "oa_color": oa_color
    }

    h = {
        "content-type": "text/json",
        "X-Forwarded-For": get_ip()
    }
    url = "http://logs-01.loggly.com/inputs/6470410b-1d7f-4cb2-a625-72d8fa867d61/tag/{}/".format(oa_color)
    requests.post(url, headers=h, data=json.dumps(body))
def set_data_for_all_products(self, method_name, high_priority=False, include_products=None):
    start_time = time()
    threads = []

    # use all products unless passed a specific set
    if not include_products:
        include_products = self.all_products

    # start a thread for each product
    for work in include_products:
        method = getattr(work, method_name)
        process = threading.Thread(target=method, args=[high_priority])
        process.start()
        threads.append(process)

    # wait till all work is done
    for process in threads:
        process.join()

    # now go see if any of them had errors
    # need to do it this way because can't catch thread failures; have to check
    # object afterwards instead to see if they logged failures
    for work in include_products:
        if work.error:
            # don't print out doi here because that could cause another bug
            # print u"setting person error; {} for product {}".format(work.error, work.id)
            self.error = work.error

    print u"finished {method_name} on {num} products in {sec}s".format(
        method_name=method_name.upper(),
        num=len(include_products),
        sec=elapsed(start_time, 2)
    )
def refresh(self, high_priority=False):
    print u"* refreshing {} ({})".format(self.orcid_id, self.full_name)
    self.error = None
    start_time = time()
    try:
        print u"** calling call_apis"
        self.call_apis(high_priority=high_priority)

        print u"** calling calculate"
        self.calculate()

        print u"** finished refreshing all {num} products for {orcid_id} ({name}) in {sec}s".format(
            orcid_id=self.orcid_id,
            name=self.full_name,
            num=len(self.all_products),
            sec=elapsed(start_time)
        )
    except (KeyboardInterrupt, SystemExit):
        # let these ones through, don't save anything to db
        raise
    except requests.Timeout:
        self.error = "requests timeout"
    except OrcidDoesNotExist:
        self.invalid_orcid = True
        self.error = "invalid orcid"
    except Exception:
        logging.exception("refresh error")
        self.error = "refresh error"
        print u"in generic exception handler, so rolling back in case it is needed"
        db.session.rollback()
    finally:
        self.updated = datetime.datetime.utcnow().isoformat()
        if self.error:
            print u"ERROR refreshing person {}: {}".format(self.id, self.error)
def set_cran_dependencies(login, repo_name):
    start_time = time()
    repo = get_repo(login, repo_name)
    if repo is None:
        return None

    repo.set_cran_dependencies()
    commit_repo(repo)
    # the rounding argument belongs inside elapsed(), not format()
    print "found deps and committed. took {}sec".format(elapsed(start_time, 4))
    return None  # important that it returns None for RQ
def set_one_requirements_pypi(login, repo_name):
    start_time = time()
    repo = get_repo(login, repo_name)
    if repo is None:
        return None

    repo.set_requirements_pypi()
    commit_repo(repo)
    # the rounding argument belongs inside elapsed(), not format()
    print "cleaned requirements, committed. took {}sec".format(elapsed(start_time, 4))
    return None  # important that it returns None for RQ
def build_synapses(model):
    '''construct reciprocal synapses'''
    model.mgrss = {}
    for r in model.rank_gconnections:
        for ci in model.rank_gconnections[r]:
            rsyn = mgrs.mk_mgrs(*ci[0:7])
            if rsyn:
                model.mgrss.update({rsyn.md_gid: rsyn})
    for mgid in model.mconnections:
        for ci in model.mconnections[mgid]:
            # do not duplicate if already built because granule exists on this process
            if not model.mgrss.has_key(mgrs.mgrs_gid(ci[0], ci[3], ci[6])):
                rsyn = mgrs.mk_mgrs(*ci[0:7])
                if rsyn:
                    model.mgrss.update({rsyn.md_gid: rsyn})
    nmultiple = int(pc.allreduce(mgrs.multiple_cnt(), 1))
    if rank == 0:
        print 'nmultiple = ', nmultiple
    detectors = h.List("ThreshDetect")
    elapsed('%d ThreshDetect for reciprocal synapses constructed' % int(pc.allreduce(detectors.count(), 1)))
def set_cran_dependencies(self):
    """
    using self.dependency_lines, finds all cran libs imported by repo.
    """
    start_time = time()
    self.cran_dependencies = []

    if not self.dep_lines:
        return []

    lines = self.dep_lines.split("\n")
    import_lines = [l.split(":")[1] for l in lines if ":" in l]

    modules_imported = set()
    library_or_require_re = re.compile(ur'(?:library|require)\((.*?)[\)|,|\s]', re.IGNORECASE)

    for line in import_lines:
        for clause in line.split(";"):
            # print u"\nchecking this line: {}".format(clause)
            clean_line = clause.strip()
            clean_line = clean_line.replace("'", "")
            clean_line = clean_line.replace('"', "")
            clean_line = clean_line.replace(' ', "")
            clean_line = clean_line.replace('library.dynam', "library")
            clean_line = clean_line.replace('install.packages', "library")
            clean_line = clean_line.replace('require.package', "require")

            if clean_line.startswith("#"):
                # print "skipping, is a comment"
                pass  # is a comment
            else:
                modules = library_or_require_re.findall(clean_line)
                for module in modules:
                    modules_imported.add(module)
                if modules:
                    # print "found modules", modules
                    pass
                else:
                    print "NO MODULES found in ", clean_line

    print "all modules found:", modules_imported
    self.lib_matches_raw = list(modules_imported)

    from models.package import CranPackage
    matching_cran_packages = set(CranPackage.valid_package_names(modules_imported))
    # print "and here are the ones that match cran!", matching_cran_packages
    # print "*********here are the ones that didn't match", modules_imported - matching_cran_packages
    self.lib_matches_final = list(matching_cran_packages)

    print "done finding cran deps for {}: {} (took {}sec)".format(
        self.full_name,
        self.lib_matches_final,
        elapsed(start_time, 4)
    )
    return self.lib_matches_final
def call_local_lookup_oa(self, limit_to_products=None):
    start_time = time()
    if limit_to_products:
        products = limit_to_products
    else:
        products = self.products
    for p in products:
        p.set_local_lookup_oa()
    print u"finished local step of set_fulltext_urls in {}s".format(elapsed(start_time, 2))
def main(fn, optional_args=None):
    start = time()

    # call function by its name in this module, with all args :)
    # http://stackoverflow.com/a/4605/596939
    if optional_args:
        globals()[fn](*optional_args)
    else:
        globals()[fn]()

    print "total time to run:", elapsed(start)
def set_pypi_in_formal_only(login, repo_name):
    print "working on ", login, repo_name
    start_time = time()
    repo = get_repo(login, repo_name)
    if repo is None:
        return None

    repo.set_pypi_in_formal_only()
    commit_repo(repo)
    # the rounding argument belongs inside elapsed(), not format()
    print "calculated pypi_in_formal_only, committed. took {}sec".format(elapsed(start_time, 4))
    return None  # important that it returns None for RQ
def http_get(url,
             headers={},
             read_timeout=60,
             connect_timeout=60,
             stream=False,
             cache_enabled=False,
             allow_redirects=True,
             publisher=None,
             session_id=None,
             ask_slowly=False):

    start_time = time()

    # reset
    os.environ["HTTP_PROXY"] = ""

    try:
        logger.info(u"LIVE GET on {}".format(url))
    except UnicodeDecodeError:
        logger.info(u"LIVE GET on an url that throws UnicodeDecodeError")

    max_tries = 2
    if ask_slowly:
        max_tries = 3
    success = False
    tries = 0
    r = None
    while not success:
        try:
            r = call_requests_get(url,
                                  headers=headers,
                                  read_timeout=read_timeout,
                                  connect_timeout=connect_timeout,
                                  stream=stream,
                                  publisher=publisher,
                                  session_id=session_id,
                                  ask_slowly=ask_slowly)
            success = True
        except (KeyboardInterrupt, SystemError, SystemExit):
            raise
        except Exception as e:
            # don't make this an exception log for now
            logger.info(u"exception in call_requests_get")
            tries += 1
            if tries >= max_tries:
                logger.info(u"in http_get, tried too many times, giving up")
                raise
            else:
                logger.info(u"in http_get, got an exception, trying again")
        finally:
            logger.info(u"finished http_get for {} in {} seconds".format(url, elapsed(start_time, 2)))
    return r