Example #1
def run_ica():

	log('loading data')
	start = util.now()
	voxels, xdim, ydim, zdim = load_data()
	log('  elapsed: {}'.format(util.elapsed(start)))

	log('running independent component analysis')
	start = util.now()
	ica = decomposition.FastICA(n_components=64, max_iter=200)
	sources = ica.fit_transform(voxels)
	sources = to_dataframe(sources, load_subject_ids(), ['X{}'.format(i) for i in range(64)])
	log('  elapsed: {}'.format(util.elapsed(start)))

	log('calculating correlations between voxel and component time courses')
	start = util.now()
	correlations = []
	for voxel in voxels.columns[:32]:
		voxel = voxels[voxel]
		max_correlation = 0
		for source in sources.columns:
			source = sources[source]
			correlation = np.corrcoef(voxel, source)[0, 1]  # off-diagonal entry of the 2x2 correlation matrix
			if correlation > max_correlation:
				max_correlation = correlation
		correlations.append(max_correlation)
	log('  elapsed: {}'.format(util.elapsed(start)))
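For reference, the util.now() / util.elapsed() helpers used above are not included in this listing; a minimal sketch of what they might look like (the names and the return format are assumptions, not the original module) is:

import time
from datetime import timedelta

def now():
    # wall-clock marker for the start of a timed block
    return time.time()

def elapsed(start):
    # human-readable duration since `start`, e.g. "0:00:12.345678"
    return str(timedelta(seconds=time.time() - start))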
Example #2
def init_gap_junctions():

    # initialize source
    for mgid in range(params.Nmitral):
        mpriden = split.mpriden(mgid)
        if mpriden:
            mpriden.push()
            pc.source_var(mpriden(0.99)._ref_v, mgid)
            h.pop_section()
            
    pc.barrier()

    # initialize targets
    for mgid in range(params.Nmitral):
        mpriden = split.mpriden(mgid)
        if mpriden:
            glomid = mgid/nmxg
            for sistermgid in range(glomid * nmxg, mgid)+range(mgid+1, (glomid+1)*nmxg):
                if pc.gid_exists(sistermgid) > 0:
                    gap = h.Gap(mpriden(0.99))
                    if sistermgid != 189:
                        getmodel().gj[(mgid, sistermgid)] = gap
                        glomid = mgid / nmxg
                        pc.target_var(gap, gap._ref_vgap, sistermgid)
    util.elapsed('Gap junctions built')
Example #3
def build_granules(model):
    '''build granules'''
    model.granules = {}
    for gid in model.granule_gids:
        g = mkgranule(gid)
        model.granules[gid] = g
    elapsed('%d granules built' % int(pc.allreduce(len(model.granules), 1)))
Example #4
def do_a_loop(first=None, last=None, url=None, threads=0, chunk_size=None):
    es = set_up_elastic(url)
    loop_start = time()
    results = es.search(index=INDEX_NAME, body=query, request_timeout=10000)
    # print u"search body:\n{}".format(query)
    print u"took {}s to search ES. remaining: {:,}".format(
        elapsed(loop_start, 2), results["hits"]["total"])
    records_to_save = []

    # decide if should stop looping after this
    if not results['hits']['hits']:
        sys.exit()

    crossref_results = []
    for crossref_hit in results['hits']['hits']:
        crossref_hit_doc = crossref_hit["_source"]
        crossref_results.append(
            CrossrefResult(crossref_hit["_id"], crossref_hit_doc))

    for crossref_result in crossref_results:
        records_to_save.append(crossref_result.make_action_record())

    # print "records_to_save", records_to_save
    print "starting saving"
    save_records_in_es(es, records_to_save, threads, chunk_size)
    print "** {}s to do {}\n".format(elapsed(loop_start, 2),
                                     len(crossref_results))
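Several examples in this collection call elapsed(start, 2) and treat the result as a number of seconds. The helper itself is not included here; a plausible sketch (an assumption, not the original implementation) is:

from time import time

def elapsed(since, round_places=2):
    # seconds since `since`, rounded for log output
    return round(time() - since, round_places)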
Example #5
 def ingest(self):                    
     debug_print("Ingesting directory {}".format(self.directory))               
     debug_print("Ingesting the files \n{}".format(self.files))                    
     is_lambda =  self.context[c.KEY_LAMBDA_FUNCTION] is not None
     timeout = self.__calculate_aggregate_window_timeout(self.context[c.KEY_MAX_LAMBDA_TIME])
     target_excretion_size = self.context[c.KEY_TARGET_AGGREGATION_FILE_SIZE_IN_MB]
     compression_ratio = self.context[c.KEY_CSV_PARQUET_COMPRESSION_RATIO]
     sep = self.context[c.KEY_SEPERATOR_PARTITION]
     memory_trigger = self.context[c.KEY_AMOEBA_MEMORY_FLUSH_TRIGGER]             
     memory_used = mutil.get_memory_usage()             
     main_filename, main_file_data, main_file_size_mb = self.__get_main_aggregate_file(self.directory, sep, target_excretion_size)        
     main_file_data = self.__append(None, main_file_data)           
     keys_ingested = []
     for file in self.files:
         debug_print("\tProcessing file {}".format(file))
         key_parts = KeyParts(file, sep)
         duration = datetime.datetime.utcnow() - key_parts.filename_timestamp            
         if duration.total_seconds() < 300:
             debug_print("The file '{}' is {}s old.  It is too new and will be processed later to allow for S3 propagation.".format(file, duration.total_seconds()))
             continue
         keys_ingested.append(file)            
         data = self.__open(file, main_file_data)
         if data is None:
             continue            
         size_in_megabytes = self.__size(file)            
         main_file_data = self.__append(main_file_data, data) 
         del data
         gc.collect()
         current_dataframe_size = sys.getsizeof(main_file_data)        
         #break conditions
         #1. Memory limit exceeded
         #2. Time window exceeded
         #3. Target excretion size hit
         main_file_size_mb += size_in_megabytes
         memory_used = mutil.get_memory_usage()           
         debug_print("\t\tSize on S3: {}MB Size of new dataset: {}bytes Estimated Compression Ratio: {} Memory Used: {}% Project Compression Size {}MB  Target Excretion Size {}MB".format(size_in_megabytes, current_dataframe_size, compression_ratio, memory_used, main_file_size_mb, target_excretion_size))
         if util.elapsed(self.context) > timeout or memory_used > memory_trigger or main_file_size_mb > target_excretion_size :
             print "Elapsed", util.elapsed(self.context), "Start:", self.starttime, "Timeout:", timeout, "Has timed out:", util.elapsed(self.context) > timeout, "Mem Used %:", memory_used, "Max Memory %:", memory_trigger
             break                
     
     #only save the files if we have a reasonable amount of time remaining before the lambda timeout.
     debug_print("Time remaining: {}s".format(util.time_remaining(self.context)))    
     debug_print("There were {} keys ingested.  The keys ingested are: \n {}".format(len(keys_ingested), keys_ingested))
     if len(keys_ingested)>0 and util.time_remaining(self.context) > c.SAVE_WINDOW_IN_SECONDS and not main_file_data.empty:            
         main_file_data = self.__convert_to_submission_df(main_file_data)
         gc.collect()
         self.__excret(self.directory, main_filename, main_file_data, sep)            
         self.__delete_keys(keys_ingested)
     elif util.time_remaining(self.context) <= c.SAVE_WINDOW_IN_SECONDS:            
         print "Time has run out!  We have less than {} seconds remaining before this lambda times out.  Abandoning the S3 commit to avoid file corruption.".format(c.SAVE_WINDOW_IN_SECONDS)
         print "Aggregation window (Max Lambda Execution Time * {}): {} seconds".format(c.RATIO_OF_MAX_LAMBDA_TIME, timeout) 
         print "S3 Save window: {} seconds".format(c.SAVE_WINDOW_IN_SECONDS) 
         print "Lambda time remaining: {} seconds".format(util.time_remaining(self.context))                        
        
     remaining_files = list(set(self.files) - set(keys_ingested))
     if len(remaining_files) > 0:        
         debug_print("Re-adding the {} paths to SQS to attempt again. The paths are \n{}".format(len(remaining_files), remaining_files))               
         self.__add_to_sqs(remaining_files)        
     print "I've consumed everything I can in bucket '{}'".format(self.directory)
     return
Example #6
    def print_status(self):
        sleep(
            1
        )  # at top to make sure there's time for the jobs to be saved in redis.

        num_jobs_remaining = ti_queues[self.queue_number].count
        num_jobs_done = self.num_jobs_total - num_jobs_remaining

        print "finished {done} jobs in {elapsed} min. {left} left.".format(
            done=num_jobs_done,
            elapsed=round(elapsed(self.start_time) / 60, 1),
            left=num_jobs_remaining)
        self.number_of_prints += 1

        if self.number_of_prints % self.seconds_between_chunks == self.seconds_between_chunks - 1:

            num_jobs_finished_this_chunk = num_jobs_done - self.last_chunk_num_jobs_completed
            if not num_jobs_finished_this_chunk:
                print "No jobs finished this chunk... :/"

            else:
                chunk_elapsed = elapsed(self.last_chunk_start_time)

                jobs_per_hour_this_chunk = num_jobs_finished_this_chunk / float(
                    chunk_elapsed / 3600)
                predicted_mins_to_finish = round(
                    (num_jobs_remaining / float(jobs_per_hour_this_chunk)) *
                    60, 1)
                print "We're doing {} jobs per hour. At this rate, done in {}min\n".format(
                    int(jobs_per_hour_this_chunk), predicted_mins_to_finish)

                self.last_chunk_start_time = time()
                self.last_chunk_num_jobs_completed = num_jobs_done

        return num_jobs_remaining
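The ETA arithmetic in print_status() reduces to a throughput estimate over the last chunk, converted to minutes. Pulled out as a standalone helper (hypothetical names, for illustration only):

def predicted_minutes_to_finish(jobs_done_this_chunk, chunk_elapsed_seconds, jobs_remaining):
    # jobs/hour over the last chunk, then remaining work divided by that rate
    jobs_per_hour = jobs_done_this_chunk / (chunk_elapsed_seconds / 3600.0)
    return round((jobs_remaining / jobs_per_hour) * 60, 1)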
Example #7
def mk_mitrals(model):
  ''' Create all the mitrals specified by mitral_gids set.'''
  model.mitrals = {}
  for gid in model.mitral_gids:
    m = mkmitral.mkmitral(gid)
    model.mitrals.update({gid : m})
  util.elapsed('%d mitrals created and connections to mitrals determined'%int(pc.allreduce(len(model.mitrals),1)))
Example #8
def build_net_round_robin(model, connection_file):

    import custom_params
    model.mitral_gids = set(range(0, min(635,
                                         custom_params.customMitralCount)))
    model.granule_gids = set(
        range(
            max(model.mitral_gids) + 1,
            min(
                122166, custom_params.customMitralCount *
                custom_params.customGranulesPerMitralCount)))
    model.gids = model.mitral_gids.union(model.granule_gids)

    enter = h.startsw()
    dc.mk_mitrals(model)
    #return # removing as per M. Migliore's email
    read_mconnection_info(model, connection_file)
    dc.mk_gconnection_info(model)
    model.gids = model.mitral_gids.copy()
    model.gids.update(model.granule_gids)
    register_mitrals(model)
    build_granules(model)
    register_granules(model)
    build_synapses(model)
    elapsed('build_net_round_robin')
    if rank == 0: print "round robin setuptime ", h.startsw() - t_begin
Example #9
def gets_a_pdf(link, base_url):
    if is_purchase_link(link):
        return False

    absolute_url = get_link_target(link, base_url)
    start = time()
    with closing(requests.get(absolute_url, stream=True, timeout=5, verify=False)) as r:
        if resp_is_pdf(r):
            print u"http header says this is a PDF. took {}s from {}".format(elapsed(start), absolute_url)
            return True

        # some publishers send a pdf back wrapped in an HTML page using frames.
        # this is where we detect that, using each publisher's idiosyncratic templates.
        # we only check based on a whitelist of publishers, because downloading this whole
        # page (r.content) is expensive to do for everyone.
        if 'onlinelibrary.wiley.com' in absolute_url:
            # = closed journal http://doi.org/10.1111/ele.12585
            # = open journal http://doi.org/10.1111/ele.12587
            if '<iframe' in r.content:
                print u"this is a Wiley 'enhanced PDF' page. took {}s".format(elapsed(start))
                return True

        elif 'ieeexplore' in absolute_url:
            # (this is a good example of one dissem.in misses)
            # = open journal http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6740844
            # = closed journal http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6045214
            if '<frame' in r.content:
                print u"this is a IEEE 'enhanced PDF' page. took {}s".format(elapsed(start))
                return True


        print u"we've decided this ain't a PDF. took {}s".format(elapsed(start))
        return False
Example #10
def build_granules(model):
  '''build granules'''
  model.granules = {}
  for gid in model.granule_gids:
    g = mkgranule(gid)    
    model.granules.update({gid : g})
  elapsed('%d granules built'%int(pc.allreduce(len(model.granules),1)))
Example #11
def run_ica():

    log('loading data')
    start = util.now()
    voxels, xdim, ydim, zdim = load_data()
    log('  elapsed: {}'.format(util.elapsed(start)))

    log('running independent component analysis')
    start = util.now()
    ica = decomposition.FastICA(n_components=64, max_iter=200)
    sources = ica.fit_transform(voxels)
    sources = to_dataframe(sources, load_subject_ids(),
                           ['X{}'.format(i) for i in range(64)])
    log('  elapsed: {}'.format(util.elapsed(start)))

    log('calculating correlations between voxel and component time courses')
    start = util.now()
    correlations = []
    for voxel in voxels.columns[:32]:
        voxel = voxels[voxel]
        max_correlation = 0
        for source in sources.columns:
            source = sources[source]
            correlation = np.corrcoef(voxel, source)[0, 1]  # off-diagonal entry of the 2x2 correlation matrix
            if correlation > max_correlation:
                max_correlation = correlation
        correlations.append(max_correlation)
    log('  elapsed: {}'.format(util.elapsed(start)))
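The nested correlation loop above can also be expressed with a single call to np.corrcoef over the stacked voxel and component columns. A vectorized sketch, assuming voxels and sources are pandas DataFrames with one observation per row (note the loop above also floors the result at 0, which this sketch does not):

import numpy as np

def max_component_correlations(voxels, sources, n_voxels=32):
    vox = voxels.iloc[:, :n_voxels].values.T   # (n_voxels, n_observations)
    src = sources.values.T                     # (n_components, n_observations)
    corr = np.corrcoef(np.vstack([vox, src]))  # full correlation matrix
    cross = corr[:n_voxels, n_voxels:]         # voxel-vs-component block
    return cross.max(axis=1)                   # best-matching component per voxel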
Example #12
def mk_mitrals(model):
  ''' Create all the mitrals specified by mitral_gids set.'''
  model.mitrals = {}
  for gid in model.mitral_gids:
    m = mkmitral.mkmitral(gid)
    model.mitrals.update({gid : m})
  util.elapsed('%d mitrals created and connections to mitrals determined'%int(pc.allreduce(len(model.mitrals),1)))
Example #13
def load_campaign(filename, campaign=None, limit=None):

    with open("data/" + filename, "r") as f:
        lines = f.read().split("\n")
        print "found {} ORCID lines".format(len(lines))

    print len(lines)

    if limit:
        lines = lines[:limit]

    total_start = time()
    row_num = 0
    for line in lines:
        row_num += 1

        # can have # as comments
        if line.startswith("#"):
            print "skipping comment line"
            continue

        loop_start = time()
        email = None

        if "," in line:
            (dirty_orcid, email, twitter) = line.split(",")
        else:
            dirty_orcid = line

        try:
            orcid_id = clean_orcid(dirty_orcid)
        except NoOrcidException:
            try:
                print u"\n\nWARNING: no valid orcid_id in line {}; skipping\n\n".format(
                    line)
            except UnicodeDecodeError:
                print u"\n\nWARNING: no valid orcid_id and line throws UnicodeDecodeError; skipping\n\n"
            continue

        my_person = Person.query.filter_by(orcid_id=orcid_id).first()
        if my_person:
            print u"row {}, already have person {}, skipping".format(
                row_num, orcid_id)
        else:
            print u"row {}, making person {}".format(row_num, orcid_id)
            my_person = make_person(orcid_id, high_priority=False)
            my_person.campaign = campaign
            my_person.email = email
            my_person.twitter = twitter
            db.session.merge(my_person)
            commit_success = safe_commit(db)
            if not commit_success:
                print u"COMMIT fail on {}".format(my_person.orcid_id)

        print "row {}: finished {} in {}s\n".format(row_num, orcid_id,
                                                    elapsed(loop_start))

    print "finished load_campaign on {} profiles in {}s\n".format(
        len(lines), elapsed(total_start))
Example #14
def register_blanes(model):
    '''register mitrals'''
    for gid in model.blanes:
        if h.section_exists("soma", model.blanes[gid]):
            s = model.blanes[gid].soma
            pc.set_gid2node(gid, rank)
            pc.cell(gid, h.NetCon(s(1)._ref_v, None, sec=s))
    elapsed('blanes registered')
Example #15
def output():
    wtime = h.startsw()
    for vmrec, filename in vmrecordings:
        f = open(filename, 'w')
        for j in range(int(vmrec.size())):
            f.write('%g %g\n' % (tvec.x[j], vmrec.x[j]))
        f.close()
    util.elapsed('vm recorded  write time %.g' % (h.startsw() - wtime))
Example #16
def get_package_specific_scenario_data_from_db(package_id):
    timing = []
    section_time = time()

    counter_dict = get_counter_totals_from_db(package_id)

    timing.append(("time from db: counter", elapsed(section_time, 2)))
    section_time = time()

    command = """select citing.issn_l, citing.year::int, sum(num_citations) as num_citations
        from jump_citing citing
        join jump_grid_id institution_grid on citing.grid_id = institution_grid.grid_id
        join jump_account_package institution_package on institution_grid.institution_id = institution_package.institution_id
        where citing.year < 2019 
        and institution_package.package_id='{package_id}'
        and (citing.issn_l in (select distinct issn_l from jump_counter where package_id='{package_id}'))        
        group by citing.issn_l, year""".format(package_id=package_id)
    citation_rows = None
    with get_db_cursor() as cursor:
        cursor.execute(command)
        citation_rows = cursor.fetchall()
    citation_dict = defaultdict(dict)
    for row in citation_rows:
        citation_dict[row["issn_l"]][row["year"]] = round(row["num_citations"])

    timing.append(("time from db: citation_rows", elapsed(section_time, 2)))
    section_time = time()

    command = """
        select authorship.issn_l, authorship.year::int, sum(num_authorships) as num_authorships
        from jump_authorship authorship
        join jump_grid_id institution_grid on authorship.grid_id = institution_grid.grid_id
        join jump_account_package institution_package on institution_grid.institution_id = institution_package.institution_id
        where authorship.year < 2019 
        and institution_package.package_id='{package_id}'
        and (authorship.issn_l in (select distinct issn_l from jump_counter where package_id='{package_id}'))
        group by authorship.issn_l, year""".format(package_id=package_id)
    authorship_rows = None
    with get_db_cursor() as cursor:
        cursor.execute(command)
        authorship_rows = cursor.fetchall()
    authorship_dict = defaultdict(dict)
    for row in authorship_rows:
        authorship_dict[row["issn_l"]][row["year"]] = round(
            row["num_authorships"])

    timing.append(("time from db: authorship_rows", elapsed(section_time, 2)))
    section_time = time()

    data = {
        "timing": timing,
        "counter_dict": counter_dict,
        "citation_dict": citation_dict,
        "authorship_dict": authorship_dict
    }

    return data
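The timing list in the example above follows a recurring pattern: reset a section timer, run a query, append a ("label", seconds) pair. The same idea written as a small context manager (hypothetical, not part of the original code):

from time import time
from contextlib import contextmanager

@contextmanager
def timed_section(timing, label):
    section_start = time()
    yield
    timing.append((label, round(time() - section_start, 2)))

# usage sketch:
# timing = []
# with timed_section(timing, "time from db: counter"):
#     counter_dict = get_counter_totals_from_db(package_id)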
Example #17
def mk_gconnection_info_part2(model):
    #transfer the gconnection info to the proper rank and make granule_gids set
    model.rank_gconnections = all2all(model.rank_gconnections)
    util.elapsed('rank_gconnections known')
    model.granule_gids = set([
        i[3] for r in model.rank_gconnections
        for i in model.rank_gconnections[r]
    ])
    util.elapsed('granule gids known on each rank')
Example #18
def load_campaign(filename, campaign=None, limit=None):

    with open("data/" + filename, "r") as f:
        lines = f.read().split("\n")
        print "found {} ORCID lines".format(len(lines))

    print len(lines)

    if limit:
        lines = lines[:limit]


    total_start = time()
    row_num = 0
    for line in lines:
        row_num += 1

        # can have # as comments
        if line.startswith("#"):
            print "skipping comment line"
            continue

        loop_start = time()
        email = None

        if "," in line:
            (dirty_orcid, email, twitter) = line.split(",")
        else:
            dirty_orcid = line

        try:
            orcid_id = clean_orcid(dirty_orcid)
        except NoOrcidException:
            try:
                print u"\n\nWARNING: no valid orcid_id in line {}; skipping\n\n".format(line)
            except UnicodeDecodeError:
                print u"\n\nWARNING: no valid orcid_id and line throws UnicodeDecodeError; skipping\n\n"
            continue

        my_person = Person.query.filter_by(orcid_id=orcid_id).first()
        if my_person:
            print u"row {}, already have person {}, skipping".format(row_num, orcid_id)
        else:
            print u"row {}, making person {}".format(row_num, orcid_id)
            my_person = make_person(orcid_id, store_in_db=True)
            my_person.campaign = campaign
            my_person.email = email
            my_person.twitter = twitter
            db.session.merge(my_person)
            commit_success = safe_commit(db)
            if not commit_success:
                print u"COMMIT fail on {}".format(my_person.orcid_id)

        print "row {}: finished {} in {}s\n".format(row_num, orcid_id, elapsed(loop_start))

    print "finished load_campaign on {} profiles in {}s\n".format(len(lines), elapsed(total_start))
Example #19
def mk_mconnection_info(model):
  r = {}
  GL_to_GCs = {}
  to_conn = []
  cilist = []

  # initialization
  for gid in model.mitrals.keys(): #+model.mtufted.keys():
    r[gid] = params.ranstream(gid, params.stream_latdendconnect) # init rng
    
    glomid = mgid2glom(gid) #params.cellid2glomid(gid) # init GCs connected to GL
    if glomid not in GL_to_GCs:
      GL_to_GCs[glomid] = set() 


  # lateral dendrites positions
  for cellid, cell in model.mitrals.items(): #+model.mtufted.values():
    to_conn += latconn.lateral_connections(cellid, cell)


  ntot_conn = pc.allreduce(len(to_conn),1) # all connections
  
  # connect to granule cells
  it = 0
  while pc.allreduce(len(to_conn), 2) > 0:
    connect2gc(to_conn, r, GL_to_GCs)
    # good connect vs to redo and update GL_to_GCs
    _cilist, to_conn1 = detect_intraglom_conn(to_conn, GL_to_GCs)
    #_cilist, to_conn2 = detect_over_connected_gc(_cilist)
    #to_conn = to_conn1 + to_conn2
    to_conn = to_conn1
    cilist += _cilist
    it += 1

  ntot_conn = pc.allreduce(len(cilist),1)/ntot_conn
  
  # fill the model data
  MCconn = 0
  mTCconn = 0
  for ci in cilist:
    #if params.gid_is_mitral(ci[0]):
    conns = model.mconnections
    MCconn += 1
    #elif params.gid_is_mtufted(ci[0]):
    #  conns = model.mt_connections
    #  mTCconn += 1
      
    if ci[0] not in conns:
      conns[ci[0]] = []
    conns[ci[0]].append(ci)
    
      
  util.elapsed('Mitral %d and mTufted %d cells connection infos. generated (it=%d,err=%.3g%%)'%(int(pc.allreduce(MCconn,1)),\
                                                                                                       int(pc.allreduce(mTCconn,1)),\
                                                                                                       int(pc.allreduce(it,2)),\
                                                                                                       (1-ntot_conn)*100))
Example #20
def register_mitrals(model):
    '''register mitrals'''
    for gid in model.mitrals:
        if h.section_exists("soma", model.mitrals[gid]):
            s = model.mitrals[gid].soma
            pc.set_gid2node(gid, rank)
            pc.cell(gid, h.NetCon(s(1)._ref_v, None, sec=s))
            if not mpiece_exists(gid):  # must not be doing multisplit
                wholemitral(gid, model.mitrals[gid])
    elapsed('mitrals registered')
Example #21
def register_mitrals(model):
  '''register mitrals'''
  for gid in model.mitrals:
    if h.section_exists("initialseg", model.mitrals[gid]):
      s = model.mitrals[gid].initialseg
      pc.set_gid2node(gid, rank)
      pc.cell(gid, h.NetCon(s(1)._ref_v, None, sec=s))
      if not mpiece_exists(gid): # must not be doing multisplit
        wholemitral(gid, model.mitrals[gid])
  elapsed('mitrals registered')
Example #22
    def update_fn(self, cls, method_name, objects, index=1):

        # we are in a fork!  dispose of our engine.
        # will get a new one automatically
        # if is pooling, need to do .dispose() instead
        db.engine.dispose()

        start = time()
        num_obj_rows = len(objects)

        # logger.info(u"{pid} {repr}.{method_name}() got {num_obj_rows} objects in {elapsed} seconds".format(
        #     pid=os.getpid(),
        #     repr=cls.__name__,
        #     method_name=method_name,
        #     num_obj_rows=num_obj_rows,
        #     elapsed=elapsed(start)
        # ))

        for count, obj in enumerate(objects):
            start_time = time()

            if obj is None:
                return None

            method_to_run = getattr(obj, method_name)

            # logger.info(u"***")
            logger.info(u"*** #{count} starting {repr}.{method_name}() method".format(
                count=count + (num_obj_rows*index),
                repr=obj,
                method_name=method_name
            ))

            method_to_run()

            logger.info(u"finished {repr}.{method_name}(). took {elapsed} seconds".format(
                repr=obj,
                method_name=method_name,
                elapsed=elapsed(start_time, 4)
            ))

            # for handling the queue
            if not (method_name == "update" and obj.__class__.__name__ == "Pub"):
                obj.finished = datetime.datetime.utcnow().isoformat()
            # db.session.merge(obj)


        start_time = time()
        commit_success = safe_commit(db)
        if not commit_success:
            logger.info(u"COMMIT fail")
        logger.info(u"commit took {} seconds".format(elapsed(start_time, 2)))
        db.session.remove()  # close connection nicely
        return None  # important for if we use this on RQ
Example #23
    def update_fn(self, cls, method_name, objects, index=1):

        # we are in a fork!  dispose of our engine.
        # will get a new one automatically
        # if is pooling, need to do .dispose() instead
        db.engine.dispose()

        start = time()
        num_obj_rows = len(objects)

        # logger.info(u"{pid} {repr}.{method_name}() got {num_obj_rows} objects in {elapsed} seconds".format(
        #     pid=os.getpid(),
        #     repr=cls.__name__,
        #     method_name=method_name,
        #     num_obj_rows=num_obj_rows,
        #     elapsed=elapsed(start)
        # ))

        for count, obj in enumerate(objects):
            start_time = time()

            if obj is None:
                return None

            method_to_run = getattr(obj, method_name)

            # logger.info(u"***")
            logger.info(u"*** #{count} starting {repr}.{method_name}() method".format(
                count=count + (num_obj_rows*index),
                repr=obj,
                method_name=method_name
            ))

            method_to_run()

            logger.info(u"finished {repr}.{method_name}(). took {elapsed} seconds".format(
                repr=obj,
                method_name=method_name,
                elapsed=elapsed(start_time, 4)
            ))

            # for handling the queue
            if not (method_name == "update" and obj.__class__.__name__ == "Pub"):
                obj.finished = datetime.datetime.utcnow().isoformat()
            # db.session.merge(obj)


        start_time = time()
        commit_success = safe_commit(db)
        if not commit_success:
            logger.info(u"COMMIT fail")
        logger.info(u"commit took {} seconds".format(elapsed(start_time, 2)))
        db.session.remove()  # close connection nicely
        return None  # important for if we use this on RQ
Example #24
def recompute_journal_metadata():
    journals_raw = JournalsDBRaw.query.all()
    print len(journals_raw)

    new_computed_journals = []

    print "making backups and getting tables ready to run"
    with get_db_cursor() as cursor:
        cursor.execute("drop table journalsdb_raw_bak_yesterday;")
        cursor.execute("drop table journalsdb_computed_bak_yesterday;")
        cursor.execute(
            "create table journalsdb_raw_bak_yesterday as (select * from journalsdb_raw);"
        )
        cursor.execute(
            "create table journalsdb_computed_bak_yesterday as (select * from journalsdb_computed);"
        )

    # do it as its own to force commit
    with get_db_cursor() as cursor:
        # don't truncate raw!  is populated by xplenty.
        # furthermore, truncate hangs, so do truncation this way instead
        cursor.execute("delete from journalsdb_computed;")
    print "tables ready for insertion"

    for journal_raw in journals_raw:
        new_journal_metadata = JournalMetadata(journal_raw)
        new_computed_journals.append(new_journal_metadata)

    print "starting commits"
    start_time = time()
    insert_values_list = [j.get_insert_values() for j in new_computed_journals]
    command_start = u"""INSERT INTO journalsdb_computed ({}) VALUES """.format(
        ",".join(JournalMetadata.get_insert_column_names()))

    with get_db_cursor() as cursor:
        i = 0
        for short_values_list in chunks(insert_values_list, 1000):
            values_list_string = u",".join(short_values_list)
            q = u"{} {};".format(command_start, values_list_string)
            cursor.execute(q)
            i += 1
            print i
    print u"done committing journals, took {} seconds total".format(
        elapsed(start_time))
    print u"now refreshing flat view"

    with get_db_cursor() as cursor:
        cursor.execute("refresh materialized view journalsdb_computed_flat;")
        cursor.execute("analyze journalsdb_computed;")

    print u"done writing to db, took {} seconds total".format(
        elapsed(start_time))
Example #25
def build_net_round_robin(model, connection_file):
    enter = h.startsw()
    dc.mk_mitrals(model)
    read_mconnection_info(model, connection_file)
    dc.mk_gconnection_info(model)
    model.gids = model.mitral_gids.copy()
    model.gids.update(model.granule_gids)
    register_mitrals(model)
    build_granules(model)
    register_granules(model)
    build_synapses(model)
    elapsed('build_net_round_robin')
    if rank == 0: print "round robin setuptime ", h.startsw() - t_begin
Example #26
def build_net_round_robin(model, connection_file):
  enter = h.startsw()
  dc.mk_mitrals(model)
  read_mconnection_info(model, connection_file)
  dc.mk_gconnection_info(model)
  model.gids = model.mitral_gids.copy()
  model.gids.update(model.granule_gids)
  register_mitrals(model)
  build_granules(model)
  register_granules(model)
  build_synapses(model)
  elapsed('build_net_round_robin')
  if rank == 0: print "round robin setuptime ", h.startsw() - t_begin
Example #27
def mk_b2g_connections():
    #    gid_blanes_existing = set([x[1] for x in params.glom2blanes ])
    getmodel().blanes2gc_connections.clear()
    elapsed('\t%d granules are generated' %
            pc.allreduce(len(getmodel().granules), 1))
    for ggid, blanes_gid, factor in load_blanes_dic('blanes6.dic'):
        getmodel().blanes2gc_connections.add((ggid, blanes_gid, factor))

#    for ggid in getmodel().granules:
#      for blanes_gid in gid_blanes_existing:
#        getmodel().blanes2gc_connections.add((ggid, blanes_gid))
    elapsed('%d blanes to granule connections generated' %
            pc.allreduce(len(getmodel().blanes2gc_connections), 1))
Example #28
def scroll_through_all_dois(query_doi=None, first=None, last=None, today=False, week=False, chunk_size=1000):
    # needs a mailto, see https://github.com/CrossRef/rest-api-doc#good-manners--more-reliable-service
    headers = {"Accept": "application/json", "User-Agent": "mailto:[email protected]"}

    if first:
        base_url = "https://api.crossref.org/works?filter=from-created-date:{first},until-created-date:{last}&rows={rows}&select=DOI&cursor={next_cursor}"
    else:
        base_url = "https://api.crossref.org/works?filter=until-created-date:{last}&rows={rows}&select=DOI&cursor={next_cursor}"

    next_cursor = "*"
    has_more_responses = True
    number_added = 0

    while has_more_responses:
        has_more_responses = False

        start_time = time()
        url = base_url.format(
            first=first,
            last=last,
            rows=chunk_size,
            next_cursor=next_cursor)
        logger.info(u"calling url: {}".format(url))

        resp = requests.get(url, headers=headers)
        logger.info(u"getting crossref response took {} seconds.  url: {}".format(elapsed(start_time, 2), url))
        if resp.status_code != 200:
            logger.info(u"error in crossref call, status_code = {}".format(resp.status_code))
            return number_added

        resp_data = resp.json()["message"]
        next_cursor = resp_data.get("next-cursor", None)
        if next_cursor:
            next_cursor = quote(next_cursor)
            if resp_data["items"] and len(resp_data["items"]) == chunk_size:
                has_more_responses = True

        dois_from_api = [clean_doi(api_raw["DOI"]) for api_raw in resp_data["items"]]
        added_pubs = add_new_pubs_from_dois(dois_from_api)
        if dois_from_api:
            logger.info(u"got {} dois from api".format(len(dois_from_api)))
        if added_pubs:
            logger.info(u"{}: saved {} new pubs, including {}".format(
                first, len(added_pubs), added_pubs[-2:]))

        number_added += len(added_pubs)

        logger.info(u"loop done in {} seconds".format(elapsed(start_time, 2)))

    return number_added
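scroll_through_all_dois() walks the Crossref API with cursor-based deep paging: start at cursor=*, keep going while the response returns a next cursor and a full page. The same control flow, stripped of the Crossref specifics (fetch_page here is a hypothetical callable returning (items, next_cursor)):

def iterate_pages(fetch_page, page_size):
    cursor = "*"
    while True:
        items, cursor = fetch_page(cursor)
        for item in items:
            yield item
        if not cursor or len(items) < page_size:
            break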
Example #29
def do_a_loop(first=None, last=None, url=None, threads=0, chunk_size=None):
    just_random = True

    loop_start = time()
    es = set_up_elastic(url)

    if just_random:
        random_query_dict["from"] = int(random.random() * 7999)
        results = es.search(index=INDEX_NAME,
                            body=random_query_dict,
                            request_timeout=10000)
    else:
        # different every loop
        query_dict["from"] = int(random.random() * 7999)
        results = es.search(index=INDEX_NAME,
                            body=query_dict,
                            request_timeout=10000)
    # print u"search body:\n{}".format(query)
    print u"took {}s to search ES".format(elapsed(loop_start, 2))
    records_to_save = []

    # decide if should stop looping after this
    if not results["hits"]["hits"]:
        print "no hits!  exiting"
        sys.exit()

    base_results = []
    for base_hit in results["hits"]["hits"]:
        base_hit_doc = base_hit["_source"]
        base_results.append(BaseResult(base_hit_doc))

    scrape_start = time()

    # don't do scrape right now
    # targets = [base_result.scrape_for_fulltext for base_result in base_results]
    # call_targets_in_parallel(targets)
    # print u"scraping {} webpages took {}s".format(len(base_results), elapsed(scrape_start, 2))

    targets = [base_result.set_base1s for base_result in base_results]
    call_targets_in_parallel(targets)

    for base_result in base_results:
        base_result.set_fulltext_urls()
        records_to_save.append(base_result.make_action_record())

    # print "len of records_to_save", len(records_to_save)
    # print "records_to_save:", records_to_save
    save_records_in_es(es, records_to_save, threads, chunk_size)
    print "** took {}s to do {}, {:,} remaining\n".format(
        elapsed(loop_start, 2), len(base_results), results["hits"]["total"])
Example #30
def scroll_through_all_dois(query_doi=None, first=None, last=None, today=False, week=False, chunk_size=1000):
    # needs a mailto, see https://github.com/CrossRef/rest-api-doc#good-manners--more-reliable-service
    headers={"Accept": "application/json", "User-Agent": "mailto:[email protected]"}

    if first:
        base_url = "https://api.crossref.org/works?filter=from-created-date:{first},until-created-date:{last}&rows={rows}&select=DOI&cursor={next_cursor}"
    else:
        base_url = "https://api.crossref.org/works?filter=until-created-date:{last}&rows={rows}&select=DOI&cursor={next_cursor}"

    next_cursor = "*"
    has_more_responses = True
    number_added = 0

    while has_more_responses:
        has_more_responses = False

        start_time = time()
        url = base_url.format(
            first=first,
            last=last,
            rows=chunk_size,
            next_cursor=next_cursor)
        logger.info(u"calling url: {}".format(url))

        resp = requests.get(url, headers=headers)
        logger.info(u"getting crossref response took {} seconds.  url: {}".format(elapsed(start_time, 2), url))
        if resp.status_code != 200:
            logger.info(u"error in crossref call, status_code = {}".format(resp.status_code))
            return number_added

        resp_data = resp.json()["message"]
        next_cursor = resp_data.get("next-cursor", None)
        if next_cursor:
            next_cursor = quote(next_cursor)
            if resp_data["items"] and len(resp_data["items"]) == chunk_size:
                has_more_responses = True

        dois_from_api = [clean_doi(api_raw["DOI"]) for api_raw in resp_data["items"]]
        added_pubs = add_new_pubs_from_dois(dois_from_api)
        if dois_from_api:
            logger.info(u"got {} dois from api".format(len(dois_from_api)))
        if added_pubs:
            logger.info(u"{}: saved {} new pubs, including {}".format(
                first, len(added_pubs), added_pubs[-2:]))

        number_added += len(added_pubs)

        logger.info(u"loop done in {} seconds".format(elapsed(start_time, 2)))

    return number_added
Example #31
def mk_gconnection_info(model):
  mk_gconnection_info_part1(model)
  mk_gconnection_info_part2(model)

  # # Save full network Mitral-Granule connections
  # mitral2granule = {}
  # for mgid in model.mitral_gids:
  #     mitral2granule.update({mgid: [gc[3] for gc in model.mconnections[mgid]]})
  #
  # import cPickle as pickle
  # with open('mitral2granule.p', 'wb') as fp:
  #     pickle.dump(mitral2granule, fp)

  util.elapsed('mk_gconnection_info (#granules = %d)'%int(pc.allreduce(len(model.granule_gids),1)))
Example #32
def update_fn(cls, method_name, obj_id_list, shortcut_data=None, index=1):

    # we are in a fork!  dispose of our engine.
    # will get a new one automatically
    db.engine.dispose()

    start = time()

    q = db.session.query(cls).options(orm.undefer('*')).filter(cls.id.in_(obj_id_list))

    obj_rows = q.all()
    num_obj_rows = len(obj_rows)
    print "{repr}.{method_name}() got {num_obj_rows} objects in {elapsed}sec".format(
        repr=cls.__name__,
        method_name=method_name,
        num_obj_rows=num_obj_rows,
        elapsed=elapsed(start)
    )

    for count, obj in enumerate(obj_rows):
        start_time = time()

        if obj is None:
            return None

        method_to_run = getattr(obj, method_name)

        print u"\n***\n{count}: starting {repr}.{method_name}() method".format(
            count=count + (num_obj_rows*index),
            repr=obj,
            method_name=method_name
        )

        if shortcut_data:
            method_to_run(shortcut_data)
        else:
            method_to_run()

        print u"finished {repr}.{method_name}(). took {elapsed}sec".format(
            repr=obj,
            method_name=method_name,
            elapsed=elapsed(start_time, 4)
        )

    commit_success = safe_commit(db)
    if not commit_success:
        print u"COMMIT fail"
    db.session.remove()  # close connection nicely
    return None  # important for if we use this on RQ
Example #33
def update_fn(cls, method_name, obj_id_list, shortcut_data=None):

    # we are in a fork!  dispose of our engine.
    # will get a new one automatically
    db.engine.dispose()

    start = time()

    q = db.session.query(cls).filter(cls.id.in_(obj_id_list))
    if cls.__name__ == "Person":
        q = q.options(person_load_options())


    obj_rows = q.all()
    num_obj_rows = len(obj_rows)
    print "{repr}.{method_name}() got {num_obj_rows} objects in {elapsed}sec".format(
        repr=cls.__name__,
        method_name=method_name,
        num_obj_rows=num_obj_rows,
        elapsed=elapsed(start)
    )

    for obj in obj_rows:
        start_time = time()

        if obj is None:
            return None

        method_to_run = getattr(obj, method_name)

        print u"\nstarting {repr}.{method_name}() method".format(
           repr=obj,
           method_name=method_name
        )

        if shortcut_data:
            method_to_run(shortcut_data)
        else:
            method_to_run()

        print u"finished {repr}.{method_name}(). took {elapsed}sec".format(
            repr=obj,
            method_name=method_name,
            elapsed=elapsed(start_time, 4)
        )

    db.session.commit()
    db.session.remove()  # close connection nicely
    return None  # important for if we use this on RQ
Example #34
    def harvest(self, **kwargs):  # pragma: no cover
        """Make HTTP requests to the OAI server.
        :param kwargs: OAI HTTP parameters.
        :rtype: :class:`sickle.OAIResponse`
        """
        start_time = time()
        for _ in range(self.max_retries):
            if self.http_method == 'GET':
                payload_str = "&".join("%s=%s" % (k, v)
                                       for k, v in kwargs.items())
                url_without_encoding = u"{}?{}".format(self.endpoint,
                                                       payload_str)
                http_response = requests.get(url_without_encoding,
                                             **self.request_args)
                self.http_response_url = http_response.url
            else:
                http_response = requests.post(self.endpoint,
                                              data=kwargs,
                                              **self.request_args)
                self.http_response_url = http_response.url
            if http_response.status_code == 503:
                retry_after = self.RETRY_SECONDS
                logger.info("HTTP 503! Retrying after %d seconds..." %
                            retry_after)
                sleep(retry_after)
            else:
                logger.info("took {} seconds to call pmh url: {}".format(
                    elapsed(start_time), http_response.url))

                http_response.raise_for_status()
                if self.encoding:
                    http_response.encoding = self.encoding
                return OAIResponse(http_response, params=kwargs)
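harvest() retries on HTTP 503 with a fixed pause between attempts. The same retry shape as a standalone function (a sketch with illustrative names, not the Sickle API):

import requests
from time import sleep

def get_with_retries(url, max_retries=3, retry_seconds=20, **request_args):
    for _ in range(max_retries):
        resp = requests.get(url, **request_args)
        if resp.status_code == 503:
            # server asked us to back off; wait and try again
            sleep(retry_seconds)
            continue
        resp.raise_for_status()
        return resp
    raise RuntimeError("still getting HTTP 503 after {} tries".format(max_retries))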
Example #35
def get_search_query(query):
    start_time = time()
    my_pubs = fulltext_search_title(query)
    response = [my_pub.to_dict_search() for my_pub in my_pubs]
    sorted_response = sorted(response, key=lambda k: k['score'], reverse=True)
    elapsed_time = elapsed(start_time, 3)
    return jsonify({"results": sorted_response, "elapsed_seconds": elapsed_time})
Example #36
def leaderboard():
    filters_dict = make_filters_dict(request.args)
    page_size = request.args.get("page_size", "25")

    start = time()
    num_total, leaders = get_leaders(filters=filters_dict,
                                     page_size=int(page_size))

    leaders_list = [leader.as_snippet for leader in leaders]

    ret_dict = {
        "num_returned": len(leaders_list),
        "num_total": num_total,
        "list": leaders_list,
        "type": filters_dict["type"],
        "filters": filters_dict
    }
    if "tag" in filters_dict:
        tag_obj = Tags.query.filter(
            Tags.unique_tag == filters_dict["tag"]).first()
        ret_dict["related_tags"] = tag_obj.related_tags

    ret = json_resp_from_thing(ret_dict)
    elapsed_time = elapsed(start)
    ret.headers["x-elapsed"] = elapsed_time
    return ret
Example #37
def check_pdf_urls(pdf_urls):
    for url in pdf_urls:
        make_transient(url)

    # free up the connection while doing net IO
    safe_commit(db)
    db.engine.dispose()

    req_pool = get_request_pool()

    checked_pdf_urls = req_pool.map(get_pdf_url_status, pdf_urls, chunksize=1)
    req_pool.close()
    req_pool.join()

    row_dicts = [x.__dict__ for x in checked_pdf_urls]
    for row_dict in row_dicts:
        row_dict.pop('_sa_instance_state')

    db.session.bulk_update_mappings(PdfUrl, row_dicts)

    start_time = time()
    commit_success = safe_commit(db)
    if not commit_success:
        logger.info(u"COMMIT fail")
    logger.info(u"commit took {} seconds".format(elapsed(start_time, 2)))
Example #38
def get_search_query():
    query = request.args.get("query", None)
    is_oa = request.args.get("is_oa", None)

    if is_oa is not None:
        try:
            is_oa = str_to_bool(is_oa)
        except ValueError:
            if is_oa == 'null':
                is_oa = None
            else:
                abort_json(400, "is_oa must be 'true' or 'false'")

    if not query:
        abort_json(400, "query parameter is required")

    start_time = time()
    response = fulltext_search_title(query, is_oa)
    sorted_response = sorted(response, key=lambda k: k['score'], reverse=True)

    for api_response in sorted_response:
        doi = api_response['response']['doi']
        version_suffix = re.findall(ur'[./](v\d+)$', doi, re.IGNORECASE)

        if version_suffix:
            title = api_response['response']['title']
            title = u'{} ({})'.format(title, version_suffix[0].upper())
            api_response['response']['title'] = title

    elapsed_time = elapsed(start_time, 3)
    return jsonify({"results": sorted_response, "elapsed_seconds": elapsed_time})
Example #39
def add_dois_to_queue_from_query(where, job_type):
    logger.info(u"adding all dois, this may take a while")
    start = time()

    table_name = "doi_queue"

    # run_sql(db, "drop table {} cascade".format(table_name(job_type)))
    # create_table_command = "CREATE TABLE {} as (select id, random() as rand, null::timestamp as finished, null::timestamp as started, null::text as dyno from crossref)".format(
    #     table_name(job_type))
    create_table_command = "CREATE TABLE {} as (select id, random() as rand, null::timestamp as finished, null::timestamp as started from pub);".format(
        table_name)

    if where:
        create_table_command = create_table_command.replace(
            "from pub)", "from pub where {})".format(where))
    run_sql(db, create_table_command)
    create_table_command += """
        alter table {table_name} alter column rand set default random();
        CREATE INDEX {table_name}_id_idx ON {table_name} USING btree (id);
        CREATE INDEX {table_name}_finished_null_rand_idx on {table_name} (rand) where finished is null;
        CREATE INDEX {table_name}_started_null_rand_idx ON {table_name} USING btree (rand, started) WHERE started is null;
        -- from https://lob.com/blog/supercharge-your-postgresql-performance
        -- vacuums and analyzes every ten million rows
        ALTER TABLE {table_name} SET (autovacuum_vacuum_scale_factor = 0.0);
        ALTER TABLE {table_name} SET (autovacuum_vacuum_threshold = 10000000);
        ALTER TABLE {table_name} SET (autovacuum_analyze_scale_factor = 0.0);
        ALTER TABLE {table_name} SET (autovacuum_analyze_threshold = 10000000);
        """.format(table_name=table_name)
    for command in create_table_command.split(";"):
        run_sql(db, command)

    command = """create or replace view export_queue as
     SELECT id AS doi,
        updated AS updated,
        response_jsonb->>'evidence' AS evidence,
        response_jsonb->>'oa_status' AS oa_color,
        response_jsonb->>'free_fulltext_url' AS best_open_url,
        response_jsonb->>'year' AS year,
        response_jsonb->>'found_hybrid' AS found_hybrid,
        response_jsonb->>'found_green' AS found_green,
        response_jsonb->>'error' AS error,
        response_jsonb->>'is_boai_license' AS is_boai_license,
        replace(api->'_source'->>'journal', '
    ', '') AS journal,
        replace(api->'_source'->>'publisher', '
    ', '') AS publisher,
        api->'_source'->>'title' AS title,
        api->'_source'->>'subject' AS subject,
        response_jsonb->>'license' AS license
       FROM pub where id in (select id from {table_name})""".format(
        table_name=table_name)

    # if job_type:
    #     command_with_hybrid = command.replace("response_jsonb", "response_with_hybrid").replace("export_queue", "export_queue_with_hybrid")
    run_sql(db, command)

    # they are already lowercased
    logger.info(u"add_dois_to_queue_from_query done in {} seconds".format(
        elapsed(start, 1)))
    print_status(job_type)
Example #40
def check_pdf_urls(pdf_urls):
    for url in pdf_urls:
        make_transient(url)

    # free up the connection while doing net IO
    safe_commit(db)
    db.engine.dispose()

    req_pool = get_request_pool()

    checked_pdf_urls = req_pool.map(get_pdf_url_status, pdf_urls, chunksize=1)
    req_pool.close()
    req_pool.join()

    row_dicts = [x.__dict__ for x in checked_pdf_urls]
    for row_dict in row_dicts:
        row_dict.pop('_sa_instance_state')

    db.session.bulk_update_mappings(PdfUrl, row_dicts)

    start_time = time()
    commit_success = safe_commit(db)
    if not commit_success:
        logger.info(u"COMMIT fail")
    logger.info(u"commit took {} seconds".format(elapsed(start_time, 2)))
Example #41
    def _grep_for_dep_lines(self, query_str, include_globs, exclude_globs):
        arg_list = ['zipgrep', query_str, self.temp_file_name]
        arg_list += include_globs
        arg_list.append("-x")
        arg_list += exclude_globs
        start = time()

        try:
            print "Running zipgrep: '{}'".format(" ".join(arg_list))
            self.dep_lines = subprocess32.check_output(arg_list, timeout=90)

        except subprocess32.CalledProcessError:
            # heroku throws an error here when there are no dep lines to find.
            # but it's fine. there just aren't no lines.
            pass

        except subprocess32.TimeoutExpired:
            # too many files, we'll skip it and move on.
            self.error = "grep_timeout"
            pass

        finally:
            self.grep_elapsed = elapsed(start, 4)
            #print "found these dep lines: {}".format(self.dep_lines)
            print "finished dep lines search in {} sec".format(
                self.grep_elapsed)
Example #42
def add_repos_from_remote_csv(csv_url, language):
    start = time()

    print "going to go get file"
    response = requests.get(csv_url, stream=True)
    index = 0

    for github_url in response.iter_lines(chunk_size=1000):
        login, repo_name = login_and_repo_name_from_url(github_url)
        if login and repo_name:
            repo = GithubRepo(
                login=login,
                repo_name=repo_name,
                language=language
            )
            print repo
            db.session.merge(repo)
            index += 1
            if index % 1000 == 0:
                db.session.commit()
                print "flushing on index {index}, elapsed: {elapsed}".format(
                    index=index,
                    elapsed=elapsed(start))

    db.session.commit()
Example #43
    def _grep_for_dep_lines(self, query_str, include_globs, exclude_globs):
        arg_list =['zipgrep', query_str, self.temp_file_name]
        arg_list += include_globs
        arg_list.append("-x")
        arg_list += exclude_globs
        start = time()

        try:
            print "Running zipgrep: '{}'".format(" ".join(arg_list))
            self.dep_lines = subprocess32.check_output(
                arg_list,
                timeout=90
            )

        except subprocess32.CalledProcessError:
            # heroku throws an error here when there are no dep lines to find.
            # but it's fine. there just aren't no lines.
            pass

        except subprocess32.TimeoutExpired:
            # too many files, we'll skip it and move on.
            self.error = "grep_timeout"
            pass

        finally:
            self.grep_elapsed = elapsed(start, 4)
            #print "found these dep lines: {}".format(self.dep_lines)
            print "finished dep lines search in {} sec".format(self.grep_elapsed)
Example #44
def leaderboard():
    filters_dict = make_filters_dict(request.args)
    page_size = request.args.get("page_size", "25")

    start = time()
    num_total, leaders = get_leaders(
        filters=filters_dict,
        page_size=int(page_size)
    )

    leaders_list = [leader.as_snippet for leader in leaders]

    ret_dict = {
        "num_returned": len(leaders_list),
        "num_total": num_total,
        "list": leaders_list,
        "type": filters_dict["type"],
        "filters": filters_dict
    }
    if "tag" in filters_dict:
        tag_obj = Tags.query.filter(Tags.unique_tag==filters_dict["tag"]).first()
        ret_dict["related_tags"] = tag_obj.related_tags

    ret = json_resp_from_thing(ret_dict)
    elapsed_time = elapsed(start)
    ret.headers["x-elapsed"] = elapsed_time
    return ret
Example #45
def log_request(resp):
    if request.endpoint != "get_doi_endpoint":
        return

    logging_start_time = time()

    try:
        results = json.loads(resp.get_data())["results"][0]
    except (ValueError, RuntimeError, KeyError):
        # don't bother logging if no results
        return

    oa_color = results["oa_color"]
    if not oa_color:
        oa_color = "gray"

    body = {
        "timestamp": datetime.utcnow().isoformat(),
        "elapsed": elapsed(g.request_start_time, 2),
        "ip": get_ip(),
        "status_code": resp.status_code,
        "email": request.args.get("email", None),
        "doi": results["doi"],
        "year": results.get("year", None),
        "oa_color": oa_color
    }

    h = {
        "content-type": "text/json",
        "X-Forwarded-For": get_ip()
    }

    url = "http://logs-01.loggly.com/inputs/6470410b-1d7f-4cb2-a625-72d8fa867d61/tag/{}/".format(
        oa_color)
    requests.post(url, headers=h, data=json.dumps(body))
Example #46
    def harvest(self, **kwargs):  # pragma: no cover
        """Make HTTP requests to the OAI server.
        :param kwargs: OAI HTTP parameters.
        :rtype: :class:`sickle.OAIResponse`
        """
        start_time = time()
        for _ in range(self.max_retries):
            if self.http_method == 'GET':
                payload_str = "&".join("%s=%s" % (k,v) for k,v in kwargs.items())
                url_without_encoding = u"{}?{}".format(self.endpoint, payload_str)
                http_response = requests.get(url_without_encoding,
                                             **self.request_args)
                self.http_response_url = http_response.url
            else:
                http_response = requests.post(self.endpoint, data=kwargs,
                                              **self.request_args)
                self.http_response_url = http_response.url
            if http_response.status_code == 503:
                retry_after = self.RETRY_SECONDS
                logger.info("HTTP 503! Retrying after %d seconds..." % retry_after)
                sleep(retry_after)
            else:
                logger.info("took {} seconds to call pmh url: {}".format(elapsed(start_time), http_response.url))

                http_response.raise_for_status()
                if self.encoding:
                    http_response.encoding = self.encoding
                return OAIResponse(http_response, params=kwargs)
Example #47
    def set_data_for_all_products(self, method_name, high_priority=False, include_products=None):
        start_time = time()
        threads = []

        # use all products unless passed a specific set
        if not include_products:
            include_products = self.all_products

        # start a thread for each product
        for work in include_products:
            method = getattr(work, method_name)
            process = threading.Thread(target=method, args=[high_priority])
            process.start()
            threads.append(process)

        # wait till all work is done
        for process in threads:
            process.join()

        # now go see if any of them had errors
        # need to do it this way because can't catch thread failures; have to check
        # object afterwards instead to see if they logged failures
        for work in include_products:
            if work.error:
                # don't print out doi here because that could cause another bug
                # print u"setting person error; {} for product {}".format(work.error, work.id)
                self.error = work.error

        print u"finished {method_name} on {num} products in {sec}s".format(
            method_name=method_name.upper(),
            num = len(include_products),
            sec = elapsed(start_time, 2)
        )
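
Exceptions raised inside the worker threads can't propagate to the caller, which is why failures are stashed on each product; a hypothetical minimal product that satisfies that contract (all names here are assumptions) looks like:

# hypothetical product object: methods accept high_priority and record
# failures on self.error instead of raising across the thread boundary
class ExampleProduct(object):
    def __init__(self, product_id):
        self.id = product_id
        self.error = None
        self.title = None

    def set_title(self, high_priority=False):
        try:
            self.title = fetch_title_somehow(self.id, high_priority)  # hypothetical helper
        except Exception as e:
            self.error = u"set_title failed: {}".format(e)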
Exemplo n.º 48
0
    def refresh(self, high_priority=False):
        print u"* refreshing {} ({})".format(self.orcid_id, self.full_name)
        self.error = None
        start_time = time()
        try:
            print u"** calling call_apis"
            self.call_apis(high_priority=high_priority)

            print u"** calling calculate"
            self.calculate()

            print u"** finished refreshing all {num} products for {orcid_id} ({name}) in {sec}s".format(
                orcid_id=self.orcid_id,
                name=self.full_name,
                num=len(self.all_products),
                sec=elapsed(start_time)
            )

        except (KeyboardInterrupt, SystemExit):
            # let these ones through, don't save anything to db
            raise
        except requests.Timeout:
            self.error = "requests timeout"
        except OrcidDoesNotExist:
            self.invalid_orcid = True
            self.error = "invalid orcid"
        except Exception:
            logging.exception("refresh error")
            self.error = "refresh error"
            print u"in generic exception handler, so rolling back in case it is needed"
            db.session.rollback()
        finally:
            self.updated = datetime.datetime.utcnow().isoformat()
            if self.error:
                print u"ERROR refreshing person {}: {}".format(self.id, self.error)
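
A hedged sketch of a worker loop that drives refresh and persists the result; the Person query and the merge/commit are assumptions suggested by the db.session.rollback() call above:

# assumption: Person is a SQLAlchemy model sharing the db.session used above;
# the query style is illustrative only
def refresh_people(orcid_ids, high_priority=False):
    for orcid_id in orcid_ids:
        person = Person.query.filter_by(orcid_id=orcid_id).first()
        if person is None:
            continue
        person.refresh(high_priority=high_priority)
        db.session.merge(person)
        db.session.commit()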
Exemplo n.º 49
0
def add_dois_to_queue_from_query(where, job_type):
    logger.info(u"adding all dois, this may take a while")
    start = time()

    table_name = "doi_queue"

    # run_sql(db, "drop table {} cascade".format(table_name(job_type)))
    # create_table_command = "CREATE TABLE {} as (select id, random() as rand, null::timestamp as finished, null::timestamp as started, null::text as dyno from crossref)".format(
    #     table_name(job_type))
    create_table_command = "CREATE TABLE {} as (select id, random() as rand, null::timestamp as finished, null::timestamp as started from pub);".format(
        table_name)

    if where:
        create_table_command = create_table_command.replace("from pub)", "from pub where {})".format(where))
    # (the CREATE TABLE statement is executed further down, one command at a time,
    # together with the index and autovacuum statements appended below)
    create_table_command += """
        alter table {table_name} alter column rand set default random();
        CREATE INDEX {table_name}_id_idx ON {table_name} USING btree (id);
        CREATE INDEX {table_name}_finished_null_rand_idx on {table_name} (rand) where finished is null;
        CREATE INDEX {table_name}_started_null_rand_idx ON {table_name} USING btree (rand, started) WHERE started is null;
        -- from https://lob.com/blog/supercharge-your-postgresql-performance
        -- vacuums and analyzes every ten million rows
        ALTER TABLE {table_name} SET (autovacuum_vacuum_scale_factor = 0.0);
        ALTER TABLE {table_name} SET (autovacuum_vacuum_threshold = 10000000);
        ALTER TABLE {table_name} SET (autovacuum_analyze_scale_factor = 0.0);
        ALTER TABLE {table_name} SET (autovacuum_analyze_threshold = 10000000);
        """.format(
        table_name=table_name)
    for command in create_table_command.split(";"):
        run_sql(db, command)

    command = """create or replace view export_queue as
     SELECT id AS doi,
        updated AS updated,
        response_jsonb->>'evidence' AS evidence,
        response_jsonb->>'oa_status' AS oa_color,
        response_jsonb->>'free_fulltext_url' AS best_open_url,
        response_jsonb->>'year' AS year,
        response_jsonb->>'found_hybrid' AS found_hybrid,
        response_jsonb->>'found_green' AS found_green,
        response_jsonb->>'error' AS error,
        response_jsonb->>'is_boai_license' AS is_boai_license,
        replace(api->'_source'->>'journal', '
    ', '') AS journal,
        replace(api->'_source'->>'publisher', '
    ', '') AS publisher,
        api->'_source'->>'title' AS title,
        api->'_source'->>'subject' AS subject,
        response_jsonb->>'license' AS license
       FROM pub where id in (select id from {table_name})""".format(
        table_name=table_name)

    # if job_type:
    #     command_with_hybrid = command.replace("response_jsonb", "response_with_hybrid").replace("export_queue", "export_queue_with_hybrid")
    run_sql(db, command)

    # they are already lowercased
    logger.info(u"add_dois_to_queue_from_query done in {} seconds".format(elapsed(start, 1)))
    print_status(job_type)
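
The partial indexes on rand/started only pay off if workers later claim random, unstarted rows; a hedged sketch of that kind of dequeue statement (an assumption, not shown in the source) is:

# assumption: workers claim a random batch of unstarted rows, which is what
# the (rand, started) partial index above is built for; run via run_sql or similar
claim_ids_sql = """
    UPDATE doi_queue
    SET started = now()
    WHERE id IN (
        SELECT id
        FROM doi_queue
        WHERE started IS NULL
        ORDER BY rand
        LIMIT 100
    )
    RETURNING id;
"""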
Exemplo n.º 50
0
def set_cran_dependencies(login, repo_name):
    start_time = time()
    repo = get_repo(login, repo_name)
    if repo is None:
        return None

    repo.set_cran_dependencies()
    commit_repo(repo)
    print "found deps and committed. took {}sec".format(elapsed(start_time, 4))
    return None  # important that it returns None for RQ
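
The "returns None for RQ" comment suggests this function runs as an RQ job; a hedged enqueue sketch (queue name, Redis URL, and the example login/repo are illustrative):

# assumption: set_cran_dependencies is enqueued as an RQ job; names are illustrative
import redis
from rq import Queue

redis_conn = redis.from_url("redis://localhost:6379/0")
q = Queue("deps", connection=redis_conn)
q.enqueue(set_cran_dependencies, "some-login", "some-repo-name")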
Exemplo n.º 51
0
def set_one_requirements_pypi(login, repo_name):
    start_time = time()
    repo = get_repo(login, repo_name)
    if repo is None:
        return None

    repo.set_requirements_pypi()
    commit_repo(repo)
    print "cleaned requirements, committed. took {}sec".format(elapsed(start_time, 4))
    return None  # important that it returns None for RQ
Exemplo n.º 52
0
def build_synapses(model):
  '''construct reciprocal synapses'''
  model.mgrss = {}
  for r in model.rank_gconnections:
    for ci in model.rank_gconnections[r]:
      rsyn = mgrs.mk_mgrs(*ci[0:7])
      if rsyn:
        model.mgrss.update({rsyn.md_gid : rsyn})
  for mgid in model.mconnections:
    for ci in model.mconnections[mgid]:
      #do not duplicate if already built because granule exists on this process
      if mgrs.mgrs_gid(ci[0], ci[3], ci[6]) not in model.mgrss:
        rsyn = mgrs.mk_mgrs(*ci[0:7])
        if rsyn:
          model.mgrss.update({rsyn.md_gid : rsyn})
  nmultiple = int(pc.allreduce(mgrs.multiple_cnt(), 1))
  if rank == 0:
    print 'nmultiple = ', nmultiple
  detectors = h.List("ThreshDetect")
  elapsed('%d ThreshDetect for reciprocal synapses constructed' % int(pc.allreduce(detectors.count(), 1)))
Exemplo n.º 53
0
def build_net_round_robin(model, connection_file):

  import custom_params
  model.mitral_gids = set(range(0,min(635, custom_params.customMitralCount)))
  model.granule_gids = set(range(max(model.mitral_gids)+1, min(122166, custom_params.customMitralCount*custom_params.customGranulesPerMitralCount)))
  model.gids = model.mitral_gids.union(model.granule_gids)

  enter = h.startsw()
  dc.mk_mitrals(model)
  #return # removing as per M. Migliore's email
  read_mconnection_info(model, connection_file)
  dc.mk_gconnection_info(model)
  model.gids = model.mitral_gids.copy()
  model.gids.update(model.granule_gids)
  register_mitrals(model)
  build_granules(model)
  register_granules(model)
  build_synapses(model)
  elapsed('build_net_round_robin')
  if rank == 0: print "round robin setuptime ", h.startsw() - enter
Exemplo n.º 54
0
    def set_cran_dependencies(self):
        """
        Using self.dep_lines, finds all CRAN libraries imported by the repo
        and returns the CRAN-matched names (also stored in self.lib_matches_final).
        """
        start_time = time()
        self.cran_dependencies = []
        if not self.dep_lines:
            return []

        lines = self.dep_lines.split("\n")
        import_lines = [l.split(":")[1] for l in lines if ":" in l]
        modules_imported = set()
        library_or_require_re = re.compile(ur'(?:library|require)\((.*?)[\)|,|\s]', re.IGNORECASE)


        for line in import_lines:
            for clause in line.split(";"):
                # print u"\nchecking this line: {}".format(clause)
                clean_line = clause.strip()
                clean_line = clean_line.replace("'", "")
                clean_line = clean_line.replace('"', "")
                clean_line = clean_line.replace(' ', "")
                clean_line = clean_line.replace('library.dynam', "library")
                clean_line = clean_line.replace('install.packages', "library")
                clean_line = clean_line.replace('require.package', "require")
                if clean_line.startswith("#"):
                    # print "skipping, is a comment"
                    pass # is a comment
                else:
                    modules = library_or_require_re.findall(clean_line)
                    for module in modules:
                        modules_imported.add(module)
                    if modules:
                        # print "found modules", modules
                        pass
                    else:
                        print "NO MODULES found in ", clean_line 
        print "all modules found:", modules_imported

        self.lib_matches_raw = list(modules_imported)

        from models.package import CranPackage
        matching_cran_packages = set(CranPackage.valid_package_names(modules_imported))

        # print "and here are the ones that match cran!", matching_cran_packages
        # print "*********here are the ones that didn't match", modules_imported - matching_cran_packages
        self.lib_matches_final = list(matching_cran_packages)

        print "done finding cran deps for {}: {} (took {}sec)".format(
            self.full_name,
            self.lib_matches_final,
            elapsed(start_time, 4)
        )
        return self.lib_matches_final
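
To make the regex's behaviour concrete, a small hedged check against made-up R lines, mirroring the whitespace stripping above:

# illustrative check of the library/require regex against made-up R lines
import re

library_or_require_re = re.compile(r'(?:library|require)\((.*?)[\)|,|\s]', re.IGNORECASE)
samples = ["library(ggplot2)",
           "require(plyr, quietly = TRUE)",
           "suppressMessages(library(dplyr))"]
for line in samples:
    cleaned = line.replace(" ", "")
    print cleaned, "->", library_or_require_re.findall(cleaned)
# prints ['ggplot2'], ['plyr'], ['dplyr'] respectively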
Exemplo n.º 55
0
    def call_local_lookup_oa(self, limit_to_products=None):
        start_time = time()

        if limit_to_products:
            products = limit_to_products
        else:
            products = self.products

        for p in products:
            p.set_local_lookup_oa()
        print u"finished local step of set_fulltext_urls in {}s".format(elapsed(start_time, 2))
Exemplo n.º 56
0
def main(fn, optional_args=None):
    start = time()

    # call function by its name in this module, with all args :)
    # http://stackoverflow.com/a/4605/596939
    if optional_args:
        globals()[fn](*optional_args)
    else:
        globals()[fn]()

    print "total time to run:", elapsed(start)
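
A hedged sketch of wiring this dispatcher to the command line; the argparse layer is an assumption, not from the source:

# assumption: a thin CLI wrapper around main(); run e.g. as
#   python script.py set_cran_dependencies some-login some-repo
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="run a named function from this module")
    parser.add_argument("function", help="name of the function to call")
    parser.add_argument("args", nargs="*", help="optional positional arguments")
    parsed = parser.parse_args()
    main(parsed.function, parsed.args or None)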
Exemplo n.º 57
0
def set_pypi_in_formal_only(login, repo_name):
    print "working on ", login, repo_name
    start_time = time()
    repo = get_repo(login, repo_name)
    if repo is None:
        return None

    repo.set_pypi_in_formal_only()

    commit_repo(repo)
    print "calculated pypi_in_formal_only, committed. took {}sec".format(elapsed(start_time, 4))
    return None  # important that it returns None for RQ
Exemplo n.º 58
0
def http_get(url,
             headers=None,
             read_timeout=60,
             connect_timeout=60,
             stream=False,
             cache_enabled=False,
             allow_redirects=True,
             publisher=None,
             session_id=None,
             ask_slowly=False):

    # avoid sharing a mutable default dict between calls
    headers = headers or {}

    start_time = time()

    # reset
    os.environ["HTTP_PROXY"] = ""

    try:
        logger.info(u"LIVE GET on {}".format(url))
    except UnicodeDecodeError:
        logger.info(u"LIVE GET on an url that throws UnicodeDecodeError")

    max_tries = 2
    if ask_slowly:
        max_tries = 3
    success = False
    tries = 0
    r = None
    while not success:
        try:
            r = call_requests_get(url,
                                  headers=headers,
                                  read_timeout=read_timeout,
                                  connect_timeout=connect_timeout,
                                  stream=stream,
                                  publisher=publisher,
                                  session_id=session_id,
                                  ask_slowly=ask_slowly)
            success = True
        except (KeyboardInterrupt, SystemError, SystemExit):
            raise
        except Exception as e:
            # don't make this an exception log for now
            logger.info(u"exception in call_requests_get")
            tries += 1
            if tries >= max_tries:
                logger.info(u"in http_get, tried too many times, giving up")
                raise
            else:
                logger.info(u"in http_get, got an exception, trying again")
        finally:
            logger.info(u"finished http_get for {} in {} seconds".format(url, elapsed(start_time, 2)))

    return r
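
A hedged usage sketch; the URL is an example, and call_requests_get is assumed to return a requests.Response:

# illustrative call; the URL is an example, not from the source
r = http_get("https://example.org/some/article.pdf",
             read_timeout=30,
             connect_timeout=10,
             stream=True,
             ask_slowly=True)
if r is not None and r.status_code == 200:
    logger.info(u"fetched {} with content-type {}".format(
        r.url, r.headers.get("Content-Type")))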