def test_everseen(self):
    """ensure duplicate elements are ignored"""
    u = mi.unique_everseen('AAAABBBBCCDAABBB')
    self.assertEqual(['A', 'B', 'C', 'D'], list(u))
def wordNetNER(document): plant_sns = (wn.synsets('plant', pos="n")) plant = plant_sns[1] #(botany) a living organism lacking the power of locomotion #hardcoded wordnet_names = [] #wordnet_lemmatizer = WordNetLemmatizer ##Lematizer doesn't work.... for word in document: #word = wordnet_lemmatizer.lemmatize(word) ##Lematizer doesn't work... mySynsets = wn.synsets(word, pos="n") i = 0 for i in range(0, 3): try: given_word = mySynsets[i] #tries first 3 synsets definition = (given_word.definition()) p1 = re.compile('plant(s?)\s') p2 = re.compile('organism(s?)\s') p3 = re.compile('animal(s?)\s') match1 = p1.search(definition) match2 = p2.search(definition) match3 = p3.search(definition) if match1 or match2 or match3: #if the given word has "plants" or "animals" in the def, check to see how similar it is to "plant" similarity_score = (given_word.path_similarity(plant)) #check similarity score if similarity_score >= 0.2: #print(similarity_score) #print ("The words: "+(str(given_word)) + " has a sim score of: " +str(similarity_score)) wordnet_names.append(word) named_entities.append(word) #hypernym = given_word.hypernyms() #hypernym is list #synset 'organism' exists #can't search in the hypernyms....hmm... i += 1 except IndexError: pass wordnet_ner = (list(unique_everseen(wordnet_names))) return wordnet_ner
def fetch_skill_api():
    list_of_files = ['pre_6.txt']
    #list_of_letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', ' ']
    for i in list_of_files:
        master_skills_list = []
        i_path_join = os.path.join(os.path.abspath(os.path.dirname(__file__)), "..", "static", "data", i)
        with open(i_path_join, 'r') as fp:
            contents = fp.readlines()
        #print contents
        for line in contents:
            print line
            master_skills_list.extend(autosuggest_api(line))
        print master_skills_list
        c = list(unique_everseen(master_skills_list))
        ioutput_path_join = os.path.join(os.path.abspath(os.path.dirname(__file__)), "..", "static", "data", i + 'out')
        with open(ioutput_path_join, 'w') as fp:
            for item in c:
                fp.write("%s\n" % item)
def interpret(atom_index, grammar, user_input):
    stripped_input = strip_punctuation(user_input)
    base_atoms_raw = atom_index.query(stripped_input)
    base_atoms = []
    for a in base_atoms_raw:
        base_atoms.append(a.clone_nonstopword())
    stop_words = filter(lambda a: a.get_stopword(), grammar.get_atoms())
    extra_atoms = []
    for stop_word in stop_words:
        if stop_word not in base_atoms:
            extra_atoms.append(stop_word)
    expanded_atoms = base_atoms + extra_atoms
    result = []
    for query_pattern in grammar.get_query_patterns():
        phrases = generate_phrases(grammar, expanded_atoms)
        queries = query_pattern.resolve(phrases)
        for query in queries:
            result.append(query)
    result = filter(lambda q: q.get_score() > 0, result)
    result = filter(lambda q: q.validate(base_atoms), result)
    result = list(unique_everseen(result, lambda q: q.get_english()))
    result.sort(key=lambda q: q.get_score())
    result.reverse()
    for q in result:
        q.set_base_atoms(base_atoms)
    return result
def create_a_pb_unit_cell(self, fpb_prop, uc_name ): ''' fpb_prop: a tuple contains: fuel_temps: temperature list for unique pebbles in the unit cell a matrix of unique pebbles x n layers of fuel in a triso coating_temps: a list that contains temp for each of the non-fuel layers in triso, e.g. 4x5 cgt: central graphite temperature sht: shell temperature burnups: a list of 14 burnups uc_name: unit cell name ''' fuel_temps, coating_temps, cgt, sht, uc_name, burnups, pb_comp_dir = fpb_prop fpb_list = [] unique_fpb_list = {} unique_burnups = list(unique_everseen(burnups)) unique_burnup_nb = len(unique_burnups) assert fuel_temps.shape[0] == unique_burnup_nb, 'wrong dimension %s' %str(fuel_temps.shape) assert coating_temps.shape[0] == unique_burnup_nb, 'wrong dimension' # create a list of unique pebbles for i, bu in enumerate(unique_burnups): pb_name = 'pb%s%d' % (uc_name, bu) unique_fpb_list[bu] = self.create_a_fuel_pebble(fuel_temps[bu-1, :], coating_temps[unique_burnups[i]-1, :], cgt, sht, pb_name, unique_burnups[i], pb_comp_dir) # create a list of all the 14 fuel pebbles, some of them are exactly the same for bu in burnups: fpb_list.append(unique_fpb_list[bu]) return fpb_list
def kth_smallest_elem(node, k):
    ls = inorder_traversal(node)
    # unique_everseen removes duplicates from a list in O(N) time according to
    # http://stackoverflow.com/questions/480214/how-do-you-remove-duplicates-from-a-list-in-python-whilst-preserving-order
    result = list(unique_everseen(ls))
    return result[k - 1]
def ref_insert_line(line, form):
    """
    Inserts \ref{} functions into tex files by substituting for a regex.

    :param line: a line from a tex file, in the form of a string, which we
        want to process and insert references into
    :param form: a regular expression specification for a string to be replaced.
    :return: a string, wherein all of the strings specified by form are
        replaced by \ref{form}
    """
    # lineIterator = form.search(line)
    # searchAndSub = []
    # lineNew = line
    # while searchAndSub is not None:
    #     searchAndSub = form.search(line)
    #     lineNew = lineNew.replace()
    searchresults = form.findall(line)
    iterableStrings = list(unique_everseen(searchresults))
    lineNew = line
    for substring in iterableStrings:
        lineNew = lineNew.replace(substring, r'(\ref{' + substring[1:-1] + r'})')
    return lineNew
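# A minimal usage sketch for ref_insert_line, not taken from the original
# source: the pattern and the sample line below are illustrative assumptions.
# The regex is expected to match delimited labels such as "(eq1)", because the
# function strips the first and last character of each match before wrapping
# it in \ref{}. Assumes unique_everseen is imported at module level.
import re

example_form = re.compile(r'\(\w+\)')
example_line = 'The result follows from (eq1) and (eq2), see also (eq1).'
print(ref_insert_line(example_line, example_form))
# -> The result follows from (\ref{eq1}) and (\ref{eq2}), see also (\ref{eq1}).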
def get_tree(self, node=None, filtered_ids=[]):
    """
    node - the root object to build the tree from; if it is not given,
    the tree is built from all objects.
    """
    def get_descendants(node):
        descendents = []
        children = node.get_children()
        # Note: QuerySet.filter returns a new queryset, so this call as written
        # has no effect on `children`; the pk check below does the filtering.
        children.filter(pk__in=filtered_ids)
        for n in children:
            n_descendents = get_descendants(n)
            n_descendents = [n for n in n_descendents if n.pk in filtered_ids]
            descendents += n_descendents
        return [node] + descendents

    if node:
        tree = get_descendants(node)
    else:
        tree = []
        lev1_pages = self.filter(level=1)
        if filtered_ids:
            lev1_pages = self.filter(pk__in=filtered_ids)
        for node in lev1_pages:
            tree += get_descendants(node)
    from more_itertools import unique_everseen
    tree = list(unique_everseen(tree))
    return tree
def main():
    names = []
    types = ['Grass', 'Poison', 'Fire', 'Dragon', 'Flying', 'Water', 'Bug',
             'Normal', 'Electric', 'Ground', 'Fairy', 'Fighting', 'Psychic',
             'Rock', 'Steel', 'Ice', 'Ghost', 'Dark']
    scrapeNames(names, types)

    pokemon = {
        'images': {},
        'heights': {},
        'weights': {},
        'powers': {},
        'names': []
    }

    scrapePowers(pokemon, names)
    scrapeHeightsWeights(pokemon, names)

    names = list(unique_everseen(names))
    pokemon['names'] = names

    scrapeImages(pokemon, names)

    fileName = 'stats/stats.json'
    file = open(fileName, 'w')
    json.dump(pokemon, file, indent=2)
    file.close()
def _countEntities(cols):
    for i in range(len(cols)):
        entries = cols[i]["entries"]
        entityCount = 0
        multipleEnts = False
        for entry in entries:
            entrySoup = BeautifulSoup(entry, "lxml")
            links = entrySoup.findAll("a")
            linksHref = [aTag["href"] for aTag in links]
            linksHref = list(unique_everseen(linksHref))
            linksCount = len(links)
            for link in links:
                # Images are not counted
                if link.find("img") != None:
                    linksCount -= 1
                # Likewise, only Wikipedia-internal links are allowed
                elif link["href"][0:5] != "/wiki":
                    linksCount -= 1
                # Some tables set duplicate links (e.g. TableID = 513, List of Olympic medalists in basketball)
                elif link["href"] in linksHref:
                    linksHref.remove(link["href"])  # delete on the first occurrence
                elif link["href"] not in linksHref:
                    linksCount -= 1  # on every further occurrence, correct the count
            if linksCount > 0:
                entityCount += 1
            if linksCount > 1:
                multipleEnts = True
        # Rating: a maximum of 50 points possible (100% are entities)
        cols[i]["rating"] = int(math.floor(MAX_ENTITIES_POINTS * (entityCount / len(entries))))
        cols[i]["entityCount"] = entityCount
        cols[i]["multipleEntities"] = multipleEnts
def keys(self, pattern):
    """Aggregated keys method."""
    def _keys(node, pattern):
        for result in node.keys(pattern):
            self._output_queue.put(result)

    results = self._runner(_keys, pattern)
    # return list(OrderedDict.fromkeys(results))
    return sorted(list(unique_everseen(results)))
def clash_table(self, i, j):
    units = []
    units.extend(self.row(i))
    units.extend(self.col(j))
    units.extend(self.box(which_box(i, j)))
    units = list(unique_everseen(units))
    values = list(chain(*[list(unit.available_values) for unit in units]))
    return table(values)
def __init__(self, data=np.asarray([[0, 0]]), cls_label=np.asarray([0]), ses_label=np.asarray([0]), buff_size=BUFF_SIZE, n_components=(K_CLS, K_SES, K_RES), beta=BETA, NMF_updates='beta', n_iter=N_ITER, lambdas=[0, 0, 0], normalize=False, fixed_factors=None, verbose=0, dist_mode='segment',Wn=None): self.data_shape = data.shape self.buff_size = np.min((buff_size, data.shape[0])) self.n_components = np.asarray(n_components, dtype='int32') self.beta = theano.shared(np.asarray(beta, theano.config.floatX), name="beta") self.verbose = verbose self.normalize = normalize self.lambdas = np.asarray(lambdas, dtype=theano.config.floatX) self.n_iter = n_iter self.NMF_updates = NMF_updates self.iters = {} self.scores = [] self.dist_mode = dist_mode if fixed_factors is None: fixed_factors = [] self.fixed_factors = fixed_factors fact_ = np.asarray([base.nnrandn((self.data_shape[1], np.sum(self.n_components))) for i in more_itertools.unique_everseen(itertools.izip(cls_label, ses_label))]) self.W = theano.shared(fact_.astype(theano.config.floatX), name="W", borrow=True, allow_downcast=True) fact_ = np.asarray(base.nnrandn((self.data_shape[0], np.sum(self.n_components)))) self.H = theano.shared(fact_.astype(theano.config.floatX), name="H", borrow=True, allow_downcast=True) self.factors_ = [self.H, self.W] if Wn is not None: self.Wn = Wn self.X_buff = theano.shared(np.zeros((self.buff_size, self.data_shape[1])).astype(theano.config.floatX), name="X_buff") if (self.NMF_updates == 'groupNMF') & (self.dist_mode == 'iter'): self.cls_sums = theano.shared(np.zeros((np.max(cls_label)+1, self.data_shape[1], self.n_components[0]) ).astype(theano.config.floatX), name="cls_sums", borrow=True, allow_downcast=True) self.ses_sums = theano.shared(np.zeros((np.max(ses_label)+1, self.data_shape[1], self.n_components[1]) ).astype(theano.config.floatX), name="ses_sums", borrow=True, allow_downcast=True) self.get_sum_function() self.get_updates_functions() self.get_norm_function() self.get_div_function()
def getLessonList(courselink, quality):
    global session
    coursehtml = (session.get(courselink)).text
    lessonLinkRegex = re.compile('https?://www.cybrary.it/video/\w+(?:-[\w]+)*/')
    matchedLessonLink = list(unique_everseen(lessonLinkRegex.findall(coursehtml)))
    for link in matchedLessonLink:
        print "Downloading " + link
        downloadVideos(getVideoLink(link), quality)
def reorder_cls_ses(data, cls, ses, with_index=False): """reorder the data such that there is only one continuous bloc for each pair class/session Parameters ---------- data : array the data cls : array the class labels for the data ses : array the session label for the data with_index : Boolean (default False) if True, the function returns the reordered indexes together with data and labels Returns ------- data : array with the same shape as data reordered data cls : array with the same shape as cls reordered class labels ses : array with the same shape as ses reordered session labels ind : array with the same shape as data.shape[1] reordered indexes (only if with_index==True) """ data_ordered = np.zeros((data.shape)) cls_ordered = np.zeros((cls.shape)) ses_ordered = np.zeros((ses.shape)) if with_index: index = np.arange((data.shape[1],)) index_ordered = np.zeros((index.shape)) data_fill = 0 for i in more_itertools.unique_everseen(itertools.izip(cls, ses)): ind = np.where((cls == i[0]) & (ses == i[1]))[0] bloc_length = data[(cls == i[0]) & (ses == i[1]), :].shape[0] data_ordered[data_fill:data_fill+bloc_length, ] = data[ind, :] cls_ordered[data_fill:data_fill+bloc_length] = cls[ind] ses_ordered[data_fill:data_fill+bloc_length] = ses[ind] if with_index: index_ordered[data_fill:data_fill+bloc_length] = index[ind] data_fill += bloc_length if with_index: return { 'data': data_ordered, 'cls': cls_ordered, 'ses': ses_ordered, 'ind': index_ordered} else: return { 'data': data_ordered, 'cls': cls_ordered, 'ses': ses_ordered}
def onlymonths(wordfile, months):
    if len(wordfile) == months:
        return map(lambda x: x.split("\n")[0], wordfile)
    splt = map(lambda x: x.split(","), wordfile)
    startdate = map(lambda x: x[0], splt)
    times = map(lambda x: int(x[1]), splt)
    times = mergedate(startdate, times)
    startdate = list(unique_everseen(startdate))
    wordfile = map(lambda x: x[0] + "," + str(x[1]), zip(startdate, times))
    return wordfile
def bootstrap(region, environment, domain, instance_type, coreos_channel,
              coreos_version, available, flotilla_container):
    coreos = CoreOsAmiIndex()
    cloudformation = FlotillaCloudFormation(environment, domain, coreos)
    region_meta = RegionMetadata(environment)
    regions = [r for r in unique_everseen(region)]
    cloudformation.tables(regions)
    region_params = region_meta.store_regions(regions, available, instance_type,
                                              coreos_channel, coreos_version,
                                              flotilla_container)
    cloudformation.schedulers(region_params)
    logger.info('Bootstrap complete')
def group_runs(data_dict):
    """Find unique model/physics groups"""
    all_info = data_dict.keys()

    model_physics_list = []
    for key, group in groupby(all_info, lambda x: x[0:2]):
        model_physics_list.append(key)
    family_list = list(unique_everseen(model_physics_list))

    return family_list
def matchOrganisms2(bigrams):
    latin_names = []
    for grams in bigrams:
        (w1, w2) = grams.split(" ")
        # won't pick up ones that match in 'a'; too many false positives
        p1 = re.compile('[a-z]{2,}[b-df-hj-np-tv-z]{1,}(i$|is$|ia$|ae$|um$|us$|es$|arum$)')
        match1 = p1.search(w1)
        # picks up the word 'genes'
        p2 = re.compile('[a-z]{2,}[b-df-hj-np-tv-z]{1,}(a$|i$|is$|ia$|ae$|um$|us$|es$|arum$)')
        match2 = p2.search(w2)
        if match1 and match2:
            latin_names.append(grams)
            named_entities.append(grams)
    return list(unique_everseen(latin_names))
def filters_mapping_content(exact_results: List, similarity_results: List, threshold: float) -> Tuple: """Parses compiled mapping results, when results exist, to determine a final aggregated mapping result. Args: exact_results: A nested list containing 3 sub-lists where sub-list[0] contains exact match uris, sub-list[1] contains exact match labels, and sub-list[2] contains exact match evidence. similarity_results: A nested list containing 3 sub-lists where sub-list[0] contains similarity uris, sub-list[1] contains similarity labels, and sub-list[2] contains similarity evidence. threshold: A float that specifies a cut-off for filtering cosine similarity results. Returns: A tuple of lists containing mapping results for a given row. The first tuple contains exact mapping results and the second contains similarity results. Both lists contains 3 items: uris, labels, and evidence. """ exact_uri, exact_label, exact_evid = exact_results sim_uri, sim_label, sim_evid = similarity_results exact_result: Optional[List[Any]] = [None, None, None] sim_result: Optional[List[Any]] = [None, None, None] # format results if exact_uri: exact_result = [list(unique_everseen(exact_uri)), list(unique_everseen(exact_label)), ' | '.join(exact_evid)] if sim_uri: if any(x for x in sim_evid[0].split(' | ') if float(x.split('_')[-1]) == 1.0): evid_list = sim_evid[0].split(' | ') sim_keep = [evid_list.index(x) for x in evid_list if float(x.split('_')[-1]) == 1.0] uris, labels = [sim_uri[x] for x in sim_keep], [sim_label[x] for x in sim_keep] sim_result = [uris, labels, ' | '.join([evid_list[x] for x in sim_keep])] elif any(x for x in sim_evid[0].split(' | ') if float(x.split('_')[-1]) >= threshold): evid_list = sim_evid[0].split(' | ') sim_keep = [evid_list.index(x) for x in evid_list if float(x.split('_')[-1]) >= threshold] uris, labels = [sim_uri[x] for x in sim_keep], [sim_label[x] for x in sim_keep] sim_result = [uris, labels, ' | '.join([evid_list[x] for x in sim_keep])] else: sim_result = [sim_uri, sim_label, ' | '.join(sim_evid)] return exact_result, sim_result
def evidence_writer(evidences, sentence_id, data_source, resource_v, rule_predicates, rule_type): item_set = OrderedSet() entity_set = [] # print rule_predicates for evidence in evidences: # print evidence[1] if evidence[1] in rule_predicates: if evidence[0] == resource_v[0] and evidence[2] == resource_v[ 1] and evidence[1] == data_source: pass elif evidence[0] == resource_v[1] and evidence[2] == resource_v[ 0] and evidence[1] in ["keyPerson", "capital"]: pass else: try: if '"' not in evidence[0] and '"' not in evidence[2]: if ':' not in evidence[0] and ':' not in evidence[2]: if '#' not in evidence[0] and '#' not in evidence[ 2]: if '&' not in evidence[ 0] and '&' not in evidence[2]: if '=' not in evidence[ 0] and '=' not in evidence[2]: if ' ' not in evidence[ 0] and ' ' not in evidence[2]: entity_1 = '"' + evidence[0] + '"' entity_2 = '"' + evidence[2] + '"' item_set.add(evidence[1] + '(' + entity_1 + ',' + entity_2 + ').') except: pass else: pass # print "here" # print item_set with open(evidence_path + str(sentence_id) + '_.txt', 'wb') as csvfile: for i in item_set: if '*' not in i: try: # print i csvfile.write(i.encode('utf-8') + '\n') # csvfile.write(i+'\n') except: pass with open(evidence_path + str(sentence_id) + '_.txt', 'r') as f, \ open(evidence_path + str(sentence_id) + '_unique.txt', 'wb') as out_file: out_file.writelines(unique_everseen(f)) remove_file = evidence_path + str(sentence_id) + '_.txt' os.remove(remove_file) return item_set, entity_set
def main(ifilename): file_name = ifilename file_name_wo_end = file_name[:-4] f = open(file_name) file_content = f.read() m3 = re.findall('xmlUrl=\"(.+?)\"', file_content) m4 = re.findall('text=\"(.+?)\"', file_content) m4.pop(0) result = [] for i in range(len(m4)): print(m4[i], m3[i]) result.append((m4[i], m3[i])) rss_dict = {} for name, url in tqdm(result): try: r = requests.get( url, headers={ 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:58.0) Gecko/20100101 Firefox/58.0' }, timeout=15) except: print("failed: " + url) rss_dict[name] = r.text tqdm.write('Downloaded ' + url + '\n') json_filename = file_name_wo_end + ".json" with open(json_filename, 'w') as outfile: json.dump(rss_dict, outfile) all_urls_3 = [] for content in rss_dict.values(): m = re.findall('\"(http\S+?\.(?:mp3|mp4))[\"\?]', content, re.IGNORECASE) if m: for item in m: all_urls_3.append(item) txt_filename = file_name_wo_end + ".txt" f = open(txt_filename, 'w') unique_urls = list(unique_everseen(all_urls_3)) for url in unique_urls: f.write(url) f.write("\n") f.close()
def get_most_informative_features(self, n=50): # Determine the most relevant features, and display them. cpdist = self.classifier._feature_probdist Feats = [] for (fname, fval) in self.classifier.most_informative_features(n+5): def labelprob(l): return cpdist[l, fname].prob(fval) labels = sorted( [l for l in self.classifier._labels if fval in cpdist[l, fname].samples()], key=labelprob, ) if len(labels) == 1: continue l0 = labels[0] l1 = labels[-1] Feats.append((fname, fval, l1, l0)) #return Feats def format_double_feat(feature): if feature.startswith("("): feature = re.findall(r"\(u?[\"\'](.+)[\"\'], u?[\"\'](.+)[\"\']\)",feature)[0] if feature[0]=="<START>": return "<span class='innerFeature'>%s</span> at the beginning" % feature[1] elif feature[1]=="<END>": return "<span class='innerFeature'>%s</span> at the end" % feature[0] else: feature = "<span class='innerFeature'>%s</span> followed by <span class='innerFeature'>%s</span>" % (feature[0],feature[1]) return str(feature) return "<span class='innerFeature'>%s</span>" % feature def format_feat(fname,fval,l1,l0): if l1=="good": if fval==None: return "do not contain %s" % (format_double_feat(fname)) else: return "contain %s" % (format_double_feat(fname)) elif l1=="bad": if fval==None: return "contains %s" % (format_double_feat(fname)) else: return "do not contain %s" % (format_double_feat(fname)) formatted_features = [format_feat(*f) for f in Feats] return list(more_itertools.unique_everseen(formatted_features))[:20]
def stim_select_callback(attr, old, new, kwargs=plot_kwargs): # Values from widget are not exact stimuli names stim = get_filename(META, stim_select.value) image_cds.data = get_img(stim).data print('got_here') # remove current gaze plot remove_glyphs(image_plot, GAZE_COLORS) x_dim = int(META[stim]['x_dim']) y_dim = int(META[stim]['y_dim']) station_count = META[stim]['station_count'] city = META[stim]['txt_name'] # Grab old title, set new text t = matrix_plot.title t.text = stim_select.value + ' - (' + str(station_count) + ' stations)' # Retaining other settings color = color_select.value metric = METRICS.get(metric_select.value) matrix_cds.data = get_matrix_cds(stim, USERS, DF, color, metric).data # Yields unique 'xname's, preserving order if PRESENTING: order = list(unique_everseen(matrix_cds.data['xname'])) matrix_plot.x_range.factors = order matrix_plot.y_range.factors = list(reversed(order)) image_plot.x_range.start = 0 image_plot.y_range.start = 0 image_plot.x_range.end = x_dim image_plot.y_range.end = y_dim plot_w = x_dim + kwargs['min_border_left'] + kwargs['min_border_right'] plot_h = y_dim + kwargs['min_border_top'] + kwargs['min_border_bottom'] X = [ item for sublist in matrix_cds.data['MappedFixationPointX'] for item in sublist ] Y = [ item for sublist in matrix_cds.data['MappedFixationPointY'] for item in sublist ] duration = [ item for sublist in matrix_cds.data['FixationDuration'] for item in sublist ] fixation_cds.data = get_fixation_points(X, Y, duration).data
class JusteatSpider(scrapy.Spider): name = "justeat" allowed_domains = ["just-eat.co.uk"] # download the restuarant_urls.txt from justeat1 spider output with open("restuarant_urls.txt") as f: restuarant_urls = [x.strip('\n') for x in f.readlines()] restuarant_urls = list(unique_everseen(restuarant_urls)) # we need to run this in stages, about 7000 restuarants at a time max. print("total number of restaurants: " + str(len(restuarant_urls))) print("first record: " + str(restuarant_urls[0])) print("8000th record: " + str(restuarant_urls[7999])) print("last record: " + str(restuarant_urls[len(restuarant_urls) - 1])) while True: to_do = input("Enter Stage 1 to X (max 6 for now) ") if to_do == "1" or to_do == "2" or to_do == "3" or to_do == "4" or to_do == "5" or to_do == "6": break # build a list of urls to scrap start_urls = [] startrange = 7000 * (int(to_do) - 1) endrange = 7000 * (int(to_do)) if endrange > len(restuarant_urls): endrange = len(restuarant_urls) for a in range(startrange, endrange): start_urls.append("https://www.just-eat.co.uk" + str(restuarant_urls[a])) def parse(self, response): for sel in response.xpath('//div[@class="restaurantParts"]'): item = JusteatItem() item['name'] = sel.xpath('//*[@itemprop="name"]/text()').extract() item['address'] = sel.xpath( '//*[@itemprop="address"]/span/text()').extract() item['cuisine'] = sel.xpath( './/div/p/span[@itemprop="servesCuisine"]/text()').extract() item['ratingvalue'] = sel.xpath( './/div/p[@itemprop="aggregateRating"]/meta[@itemprop="ratingValue"]/@content' ).extract() item['ratingcount'] = sel.xpath( './/div/p[@itemprop="aggregateRating"]/meta[@itemprop="ratingCount"]/@content' ).extract() item['ratingbest'] = sel.xpath( './/div/p[@itemprop="aggregateRating"]/meta[@itemprop="bestRating"]/@content' ).extract() item['ratingworst'] = sel.xpath( './/div/p[@itemprop="aggregateRating"]/meta[@itemprop="worstRating"]/@content' ).extract() item['name_id'] = sel.xpath('.//@data-restaurant-id').extract() yield item
def polygon_2_vtx(starting_vertex, edges_to_visit): from more_itertools import unique_everseen if not edges_to_visit: return closed = False print("Edges to visit:", edges_to_visit) subpolygon = [] found_vertex = starting_vertex while not closed: for index, edge in enumerate(edges_to_visit.copy()): visiting_vertex = found_vertex #if visiting_vertex not in set(edge) and index==len(edges_to_visit.copy()): # Tracer()() #print("Not found in list of edges") #closed=True #break if visiting_vertex not in set(edge): continue subpolygon.append(visiting_vertex) print("Visiting vertex", visiting_vertex) found_starting_vtx = False subpolygon.append(found_vertex) print(visiting_vertex, " in ", edge) for index in set(edge): if visiting_vertex != index: found_vertex = index print("Found vertex:", found_vertex) subpolygon.append(found_vertex) print("Removing edge", edge) edges_to_visit.discard(edge) print(edges_to_visit) if found_vertex == starting_vertex: subpolygon = list(unique_everseen(subpolygon)) print("Back to starting vertex") closed = True break if len(subpolygon) <= 3: return else: return subpolygon
def get_level_order(bms_table: dict) -> List[str]:
    """
    Get the level order from the table data.
    :param bms_table: table data
    :return: level_order
    """
    # First, list the levels that appear in the data section, in order of appearance
    level_order_from_data = list(unique_everseen([chart["level"] for chart in bms_table["data"]]))

    # If the header does not specify level_order, use the level order derived from the data section as-is (per the spec)
    if "level_order" not in bms_table["header"]:
        return level_order_from_data

    # If the header specifies level_order, use it (per the spec)
    level_order = bms_table["header"]["level_order"]
    # ...but apply the following processing so it does not conflict with the data section (own interpretation)
    level_order = map(str, level_order)  # cast to str to match the data section (the spec is Union[str, int] for some reason)
    level_order = list(unique_everseen(level_order))  # drop any duplicates
    omissions = [level for level in level_order if level not in level_order_from_data]  # levels not found in the data section
    return level_order + omissions  # append them at the end
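# Illustrative call to get_level_order, not taken from the original source;
# the table contents below are made up to show the behaviour when the header
# specifies a level_order that covers the levels used in the data section.
example_table = {
    "header": {"level_order": [2, 1]},
    "data": [{"level": "1"}, {"level": "2"}, {"level": "1"}],
}
print(get_level_order(example_table))  # -> ['2', '1']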
def list_load_balancers_external_subnets(self):
    client = self.session.client('elbv2')
    lb_list_external = []
    for lb in client.describe_load_balancers()['LoadBalancers']:
        if lb['Scheme'] == 'internet-facing':
            lb_list_external.append(lb)
    if not lb_list_external:
        print(colored("[WARNING] No Public Load Balancers detected", "red"))
    sub_list = []
    for i in lb_list_external:
        for k in i['AvailabilityZones']:
            sub_list.append(k['SubnetId'])
    return list(unique_everseen(sub_list))
def working(self):
    """A list of the current tests with warnings or errors.

    A device is working if the list is empty.

    This property returns, for the last test performed of each type,
    the one with the worst ``severity`` of them, or ``None`` if no test
    has been executed.
    """
    from ereuse_devicehub.resources.event.models import Test
    current_tests = unique_everseen(
        (e for e in reversed(self.events) if isinstance(e, Test)),
        key=attrgetter('type'))  # last test of each type
    return self._warning_events(current_tests)
def add_to_udl(self, new_udl):
    """
    Adds more dictionaries to the udl, sorts them, and removes duplicate
    entries. Doesn't add dictionaries with confirmed wrong URLs or levels.
    """
    new_udl = [ud for ud in new_udl if ud["URL"] not in self.wrong_urls]
    if self.confirmed_level is not None:
        new_udl = [ud for ud in new_udl if ud["Level"] == self.confirmed_level]
    self.udl = new_udl + self.udl
    self.udl = list(unique_everseen(self.udl))
    self.udl = sorted(self.udl, key=lambda k: k['Score'], reverse=True)
def getPods():
    Initconnection.Initconnection.loadConfig()
    apiV1 = client.CoreV1Api()
    Logging.Logging.log('Retrieving pods')
    ret = apiV1.list_pod_for_all_namespaces(watch=False)
    output = []
    for pod in ret.items:
        if pod is None:
            Logging.Logging.log('Pods were not found')
        else:
            output.append(pod.metadata.name)
    return list(unique_everseen(output))
def weighted_cosine_match(a, b):
    """Cosine match w/ weights based on order"""
    a_list = list(unique_everseen(a))
    b_list = list(unique_everseen(b))
    domain = list(unique_everseen(a_list + b_list))
    a_vec = [
        _weight_of(word, a_list, "descending") if word in a_list else 0
        for word in domain
    ]
    b_vec = [
        _weight_of(word, b_list, "ascending") if word in b_list else 0
        for word in domain
    ]
    product = sum(x * y for x, y in zip(a_vec, b_vec))
    a_mag = math.sqrt(sum(x * x for x in a_vec))
    b_mag = math.sqrt(sum(x * x for x in b_vec))
    return product / (a_mag * b_mag)
def parse1(self, a_pbed, input_file, type):
    if type == 's':
        str_list = []
        str_list.append(
            '%%---Pebble unit cell with position from input file\n' +
            'pbed %d %d "%s"\n' % (self.univ.id, a_pbed.coolant.gen.univ.id, input_file))
        str_list.append('%%---Coolant in the unit cell\n' +
                        a_pbed.coolant.generate_output())
        str_list.append('%%---Pebbles in the unit cell(pbed)\n')
        for pb in list(unique_everseen(a_pbed.pb_list)):
            str_list.append(pb.generate_output())
        return ''.join(str_list)
def get_groups_by_types(
        self, types: List[str]) -> Union[AlignmentGroup, List[AlignmentGroup]]:
    """Return AlignmentGroups by fragment type.

    :param types: list of types
    :return:
    """
    groups = self.groups_by_type()
    if isinstance(types, str):
        return groups[types]
    else:
        return list(unique_everseen(flatten([groups[t] for t in types])))
def evaluate(predicts, real_labels):
    error = 1 - accuracy_score(predicts, real_labels)  # Compute error
    accuracy_score_of_all = accuracy_score(predicts, real_labels)
    print(f"\nTotal accuracy of predictions: {accuracy_score_of_all}")

    # Create confusion matrix
    cnf_mat = confusion_matrix(real_labels, predicts, list(unique_everseen(real_labels)))
    print(cnf_mat)

    df_cm = pd.DataFrame(cnf_mat,
                         index=list(unique_everseen(real_labels)),
                         columns=list(unique_everseen(real_labels)))
    col_sum = df_cm.sum(axis=1)
    df_cm = df_cm.div(col_sum, axis=0)

    plt.figure(figsize=(20, 20))
    heatmap = sns.heatmap(df_cm, annot=True)
    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=8)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=8)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.title("Confusion Matrix, Normalized")
    plt.show()

    return error, cnf_mat
def names(vma):
    """ Obtain names of all coordinates defined in the V-Matrix.

    :param vma: V-Matrix
    :type vma: automol V-Matrix data structure
    :rtype: tuple(str)
    """
    name_mat = name_matrix(vma)
    _names = filter(lambda x: x is not None,
                    numpy.ravel(numpy.transpose(name_mat)))
    return tuple(more_itertools.unique_everseen(_names))
def getNodes():
    Initconnection.Initconnection.loadConfig()
    Logging.Logging.log('Retrieving Nodes')
    apiV1 = client.CoreV1Api()
    nodes = apiV1.list_node()
    endpoints = []
    for node in nodes.items:
        if node is None:
            Logging.Logging.log('No nodes found')
        else:
            endpoints.append(f'{node.metadata.name}')
    return list(unique_everseen(endpoints))
def find_topics(self, test):
    topics = []
    tp_num = []
    for quest in test.sscquestions_set.all():
        tp_number = quest.topic_category
        try:
            tp_name = changeIndividualNames(tp_number, quest.section_category)
        except:
            tp_name = 'Topics'
        topics.append(tp_name)
    topics = list(unique_everseen(topics))
    num_questions = len(test.sscquestions_set.all())
    return topics, num_questions
def step(context, modes):
    context.execute_steps(
        u"""then wait for element {element} identified by css_selector""".format(
            element=DETECTMODE))
    labels_list = context.browser.find_elements_by_css_selector(DETECTMODE)
    modes_list = modes.split(",")
    label_obtained_elements = []
    for element in labels_list:
        if element.text != "":
            label_obtained_elements.append(element.text)
    label_obtained_elements = list(unique_everseen(label_obtained_elements))
    assert modes_list == label_obtained_elements, \
        "Error: the list of options is not the expected. Expected {expected}, obtained {obtained}".format(
            expected=modes_list, obtained=label_obtained_elements)
def getServices():
    Initconnection.Initconnection.loadConfig()
    Logging.Logging.log('Retrieving Services')
    apiV1 = client.CoreV1Api()
    services = apiV1.list_service_for_all_namespaces()
    output = []
    for service in services.items:
        if service is None:
            Logging.Logging.log('No services found')
        else:
            output.append(f'{service.metadata.name}')
    return list(unique_everseen(output))
def _filter_unique_results(results):
    return list(
        unique_everseen(
            results,
            key=lambda x: (
                x["query"]["sequence_id"],
                x["query"]["start"],
                x["query"]["end"],
                x["subject"]["sequence_id"],
                x["subject"]["start"],
                x["subject"]["end"],
            ),
        ))
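# Illustrative call to _filter_unique_results, not taken from the original
# source; the two dictionaries below are made-up alignment records that share
# the same query/subject coordinates, so only the first one is kept.
example_hits = [
    {"query": {"sequence_id": "q1", "start": 0, "end": 50},
     "subject": {"sequence_id": "s1", "start": 10, "end": 60},
     "score": 98},
    {"query": {"sequence_id": "q1", "start": 0, "end": 50},
     "subject": {"sequence_id": "s1", "start": 10, "end": 60},
     "score": 97},
]
assert len(_filter_unique_results(example_hits)) == 1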
def df_to_mct(df): full_array = np.array(df) loadcases = np.array(df.columns.values)[1:] combinations = full_array[2:, 0] load_types = full_array[1, 1:] factors = full_array[2:, 1:] mct_string = '*LOADCOMB ; Combinations\n; NAME=NAME, KIND, ACTIVE, bES, iTYPE, DESC, iSERV-TYPE, nLCOMTYPE, nSEISTYPE ; line 1\n; ANAL1, LCNAME1, FACT1, ... ; from line 2' #Creates load combinations for row_index, row in enumerate(factors): combination = combinations[row_index] mct_string = mct_string + '\n' + ' NAME=' + str( combination) + ', GEN, ACTIVE, 0, 0, , 0, 0, 0\n' for factor_index, factor in enumerate(row): load_type = load_types[factor_index] loadcase = loadcases[factor_index] if factor_index != 0: mct_string = mct_string + ', ' mct_string = mct_string + str(load_type) + ', ' + str( loadcase) + ', ' + str(factor) #Creates combination envelopes combination_envelopes = [] for comb in combinations: combination_envelopes.append(comb.split('_')[0]) combination_envelopes.append(comb.split('|')[0]) combination_envelopes = list(unique_everseen(combination_envelopes)) combination_groups = [] for envelope in combination_envelopes: group = [] for comb in combinations: if envelope in comb: group.append(comb) combination_groups.append(group) for group_index, group in enumerate(combination_groups): mct_string = mct_string + '\n' + ' NAME=' + str( combination_envelopes[group_index] ) + ', GEN, ACTIVE, 0, 1, , 0, 0, 0\n' for comb_index, comb in enumerate(group): if comb_index != 0: mct_string = mct_string + ', ' mct_string = mct_string + 'CB' + ', ' + str(comb) + ', ' + '1.0' return mct_string
def gets_entity_ancestors(graph: Graph, uris: List[Union[URIRef, str]], rel: Union[URIRef, str] = RDFS.subClassOf, cls_lst: Optional[List] = None) -> List: """A method that recursively searches an ontology hierarchy to pull all ancestor concepts for an input entity. Args: graph: An RDFLib graph object assumed to contain ontology data. uris: A list of at least one ontology RDFLib URIRef object or string. rel: A string or RDFLib URI object containing a predicate. cls_lst: A list of URIs representing the ancestor classes found for the input class_uris. Returns: An ordered (desc; root to leaf) list of ontology objects containing the input uris ancestor hierarchy. Example: input: [URIRef('http://purl.obolibrary.org/NCBITaxon_11157')] output: ['http://purl.obolibrary.org/NCBITaxon_10239', 'http://purl.obolibrary.org/NCBITaxon_2559587', 'http://purl.obolibrary.org/NCBITaxon_2497569', 'http://purl.obolibrary.org/NCBITaxon_11157'] """ prop = rel if isinstance(rel, URIRef) else URIRef(rel) cls_lst = [] if cls_lst is None else cls_lst cls_lst = list( unique_everseen([ x if isinstance(x, URIRef) else URIRef(obo + x) for x in cls_lst ])) uris = list( unique_everseen( [x if isinstance(x, URIRef) else URIRef(obo + x) for x in uris])) ancs = list( unique_everseen( [j for k in [graph.objects(x, prop) for x in uris] for j in k])) if len(ancs) == 0 or len(set(ancs).difference(set(cls_lst))) == 0: return list(unique_everseen([str(x) for x in cls_lst])) else: uris = [x for x in ancs if x not in cls_lst] for i in uris: cls_lst.insert(0, i) return gets_entity_ancestors(graph, uris, prop, cls_lst)
def monte_carlo(forward_,reverse_): forward=[] reverse=[] fasta_sequences = SeqIO.parse(open(forward_),'fasta') for fasta in fasta_sequences: name, sequence = fasta.id, fasta.seq forward.append(str(sequence).replace('\n', '')) fasta_sequences = SeqIO.parse(open(reverse_),'fasta') for fasta in fasta_sequences: name, sequence = fasta.id, fasta.seq reverse.append(str(sequence).replace('\n', '')) seed=1000 # Iterations matches=[] count=0 cytosine=0 fh=open('forward_d/monte.carlo', 'w+') match = 2 mismatch = -1 scoring = swalign.NucleotideScoringMatrix(match, mismatch) sw = swalign.LocalAlignment(scoring) while seed != 0: sequence=forward[int(random.random()*(len(forward)-1))] # get a random sequence from [ 0, N ] contigs width= int(random.random()*(len(sequence)-1) *0.20+13) # Take lower 13+[0-20] % of sequence length first=int(random.random()*(len(sequence)-1-width)) subsequence=sequence[first:first+width] for x in reverse: index=x.find(subsequence) if index != -1 : count_=ccount(reverse,reverse.index(x))+index fh.write(str(count_)+"\t"+str(eval(str(count_+width)))+"\n") for i in range(0,width): matches.append(count_+i) count+=1 else: count_=ccount(reverse,reverse.index(x))+index alignment = sw.align(x,subsequence) cytosine+=alignment.dump() seed-=1 fh.close() forward_length=ccount(forward,"1",True) xlim=ccount(reverse,"1",True) plt.hlines(1,1,xlim) # Draw a horizontal line plt.xlim(0,xlim) plt.ylim(0.5,1.5) matches=list(unique_everseen(matches)) y = np.ones(np.shape(matches)) # Make all y values the same plt.plot(matches,y,'|',ms = 40) # Plot a line at each location specified in `matches` plt.axis('off') plt.show() print "[",count,"/ ",forward_length,"] hits, [",cytosine,"]"
def gen_Ck(k, L):
    flatten = [item for subtuple in L for item in subtuple]  # flattens all candidates
    uniqueflatten = list(unique_everseen(flatten))  # removes duplicate candidates
    Ck = list(combinations(uniqueflatten, k))  # all possible combinations of length k
    # Build a new list instead of calling Ck.remove() inside the loop, which
    # would skip elements while the list is being mutated.
    Ck = [c for c in Ck if not has_infeq_subsets(k, L, c)]
    return Ck
def get_image_url_list(category, data_type=None):
    """
    For a given data type, returns a list of filenames.

    Args:
        category (string): Category name as a string (from list_image_sets())
        data_type (string, optional): "train" or "val"

    Returns:
        list of strings: list of all filenames for that particular category
    """
    df = _load_data(category, data_type=data_type)
    image_url_list = list(unique_everseen(list(img_dir + df['fname'])))
    return image_url_list
def unique_mail(emails: Iterator[Email]) -> Iterator[Email]:
    # remove duplicates (from a file being
    # in multiple boxes and the 'default' inbox)
    # some formats won't have a message id,
    # but hopefully the date/subject creates a unique
    # key in that case
    yield from unique_everseen(
        emails,
        key=lambda m: (
            m.subject_json,
            m.message_id_json,
            m.dt,
        ),
    )
def get_int_mapping(dataframe, column):
    """Returns index, reverse index, and list of unique items in a pandas dataframe column."""
    # Convert series to list
    column_to_list = dataframe[column].tolist()
    # Find set of unique items and convert to a list
    unique_items_list = list(unique_everseen(column_to_list))
    # Create indexes for each item
    item_index = {item: idx for idx, item in enumerate(unique_items_list)}
    index_item = {idx: item for item, idx in item_index.items()}
    return item_index, index_item, unique_items_list
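# Illustrative call to get_int_mapping, not taken from the original source;
# the DataFrame below is a made-up example showing the mapping for a single
# column. Assumes unique_everseen is imported at module level.
import pandas as pd

example_df = pd.DataFrame({"color": ["red", "blue", "red", "green"]})
item_index, index_item, uniques = get_int_mapping(example_df, "color")
print(uniques)     # -> ['red', 'blue', 'green']
print(item_index)  # -> {'red': 0, 'blue': 1, 'green': 2}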
def _get_header_add(table):
    """Finds and returns the header of a table.

    :param table: A basketball-reference stats table.
    :returns: Header of the table as a list of strings.
    """
    def replace_titles(title):
        return multiple_replace(title, {'%': 'P', '3': 'T', '+/-': 'PlusMinus'})

    titles = (
        replace_titles(str(th.get_text()))
        for th in table.find_all('th')
    )
    return list(unique_everseen(titles))
def _merged_column_names_from(self, dataset_list):
    elements = []
    for idx_dataset, dataset in enumerate(dataset_list):
        # require getting the code from the dataset object always
        code = self.__dataset_objects__()[idx_dataset].code
        for index, column_name in enumerate(dataset.column_names):
            # only include column names that are not filtered out
            # by specification of the column_indexes list
            if self._include_column(dataset, index):
                # first index is the date, don't modify the date name
                if index > 0:
                    elements.append(self._rename_columns(code, column_name))
                else:
                    elements.append(column_name)
    return list(unique_everseen(elements))
def check_segments_length(self, data, cls_label, ses_label):
    cls = []
    cls_ind = []
    for i in more_itertools.unique_everseen(itertools.izip(cls_label, ses_label)):
        cls.append(i)
        start = np.where((cls_label == i[0]) & (ses_label == i[1]))[0][0]
        stop = np.where((cls_label == i[0]) & (ses_label == i[1]))[0][-1] + 1
        cls_ind.append([start, np.min([start + self.buff_size, stop])])
        if data[(cls_label == i[0]) & (ses_label == i[1]), :].shape[0] > self.buff_size:
            ind = np.where((cls_label == i[0]) & (ses_label == i[1]))[0]
            if self.verbose > 0:
                print "segment {0} to {1} is too long (length={2}, buffer={3})"\
                      "\n please increase buffer size or segment will be truncated"\
                      .format(ind[0], ind[-1], ind[-1] - ind[0] + 1, self.buff_size)
    return np.asarray(cls), np.asarray(cls_ind)
def writeStory(): markovText = "" text = "" topic = sys.argv[1] post_limit = 5 if sys.argv[2] == "twitter": # Get topic to search tweets = searchTweets(topic) # Parse Tweets for tweet in tweets: if len(tweet.strip()) > 5: # Remove social crap from tweets (credit: https://stackoverflow.com/a/8377440) stripped = lambda tweet: re.compile('\#').sub('', re.compile('RT @').sub('@', tweet, count=1)) text += ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",stripped(tweet)).split()) + '. ' text = re.sub("( s )","\'s ",text) text = re.sub("( ll )","\'ll ",text) text = re.sub("( t )","\'t ",text) text = re.sub("( ve )","\'ve ",text) text = re.sub("(https|http)"," ",text) text = re.sub("( m )","\'m ",text) text = re.sub("(y all)","y\'all",text) text = re.sub("( amp )"," & ",text) text = re.sub("( re )","\'r ",text) text = re.sub("( d )","\'d ",text) text = re.sub("(NoneNone)"," ",text) # Generate markov splitText = generateMarkov(text).rsplit(' ', 1)[0].split(".") for tweet in list(unique_everseen(splitText)): markovText+=tweet+'. ' elif sys.argv[2] == "reddit": r = praw.Reddit(user_agent='storytime script: amorgan.me/storytime') r.login('storytime_bot', 'Jnw5v9pgbHW6qXXrNb24EJkGeE8KwZ', disable_warning=True) submissions = r.get_subreddit(topic).get_new(limit=10) # Get comments for x in xrange(0,post_limit): submission = next(submissions) #comments = praw.helpers.flatten_tree(submission.comments) # Parse comments print (submission) #for comment in submission.comments: #markovText+=comment.body # print(comment.body) print (markovText)
def get_num_concepts(self):
    """Calculates the number of unique concepts in the text."""
    # Initialize empty list
    concepts = []
    pairs = self.word_pairs
    for pair in pairs:
        first = pair[0]
        second = pair[1]
        concepts.append(first)
        concepts.append(second)
    num_concepts = len(list(unique_everseen(concepts)))
    return num_concepts
def __init__(self): #init DB connection from Database_Connection import Database_Connection database_connection = Database_Connection() #get list of words from the database self.list_of_words = database_connection.get_entries_as_list() words_from_database = database_connection.get_entries_as_list() #remove quotes because it breaks SQLite for i in range(0, len(words_from_database)): words_from_database[i] = words_from_database[i].encode('ascii','ignore') words_from_database[i] = words_from_database[i].replace("'", "") #todo - fix this shit words_from_database[i] += " " #add the words that were just downloaded from Facebook self.list_of_words.extend(self.load_words_from_json()) words_from_json = self.load_words_from_json() #remove quotes because it breaks SQLite for i in range(0, len(words_from_json)): words_from_json[i] = words_from_json[i].encode('ascii','ignore') words_from_json[i] = words_from_json[i].replace("'", "") #todo - fix this shit words_from_json[i] += " " #remove duplicates from more_itertools import unique_everseen mega_list = self.list_of_words #normailze megalist for deleting duplicates mega_list = [string.strip() for string in mega_list] pruned_list = list(unique_everseen(mega_list)) print "mega list - {0}".format(len(mega_list)) print "pruned list - {0}".format(len(pruned_list)) #remove quotes because it breaks SQLite for i in range(0, len(pruned_list)): pruned_list[i] = pruned_list[i].encode('ascii','ignore') pruned_list[i] = pruned_list[i].replace("'", "") #todo - fix this shit pruned_list[i] += " " database_connection.wipe_database() database_connection.add_list_to_database(pruned_list)
def get_urls(self, soup):
    urls = []
    tags = soup.find_all(href=True) + soup.find_all(src=True)
    for tag in tags:
        if tag.get('href') is not None:
            if self.url not in tag.get('href'):
                urls.append(urljoin(self.url, tag.get('href')))
            else:
                urls.append(tag.get('href'))
        if tag.get('src') is not None:
            if self.url not in tag.get('src'):
                urls.append(urljoin(self.url, tag.get('src')))
            else:
                urls.append(tag.get('src'))
    return unique_everseen(urls)
def reorder_by_paths(x, paths):
    """
    Reorder the columns of an activity matrix according to a set of the most
    probable paths through a network. Useful for visualizing repeated sequences.

    :param x: activity matrix (rows are timepoints, cols are nodes)
    :param paths: list of tuple paths
    :return: x with optimally permuted columns
    """
    # find all the nodes that appear in probable paths, ordered by those paths
    ordering = list(unique_everseen(chain.from_iterable(paths)))

    # append the remaining nodes in index order
    for node in range(x.shape[1]):
        if node not in ordering:
            ordering.append(node)

    # reorder the columns
    return x[:, np.array(ordering)], ordering
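# Illustrative call to reorder_by_paths, not taken from the original source;
# the activity matrix and paths below are made up to show how the columns get
# permuted. The imports cover what the function itself relies on.
import numpy as np
from itertools import chain
from more_itertools import unique_everseen

example_x = np.arange(8).reshape(2, 4)  # 2 timepoints, 4 nodes
reordered, ordering = reorder_by_paths(example_x, paths=[(2, 0), (0, 1)])
print(ordering)   # -> [2, 0, 1, 3]
print(reordered)  # columns of example_x permuted to order [2, 0, 1, 3]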
def harvestobsids(): baseurl = "http://waarnemingen.be/soort/view/%s?from=1900-01-01&to=2100-01-01&rows=100&page=%s" conn = sqlite3.connect("pissebed.db") c = conn.cursor() ids = c.execute("select id from species") ids = list(itertools.chain.from_iterable(ids)) for id in ids: page = 1 previous = [] while True: try: url = baseurl % (id, page) res = br.open(url).read() obsids = list(unique_everseen(extractobsids(res))) print "Species " + str(id) + ", page " + str(page) + ": found " + str(len(obsids)) + " observations" if cmp(previous, obsids) == 0: break previous = obsids for obsid in obsids: records = c.execute("select * from observations where id = ?", (obsid,)).fetchall() if len(records) == 0: print "New observation: %s" % obsid c.execute("insert into observations (id, species_id) values (?, ?)", (obsid, id)) conn.commit() else: print "Observation %s already exists" % obsid except Exception, e: print "Error: " + str(e) page = page + 1 if page > maxpages: break time.sleep(random.randint(mindelay, maxdelay))