def get_data_state(name: str):
    try:
        name = name.lower()
        response = requests.get(
            ApiBrazilState.URL_STATE.value.format(state=name), timeout=2)
        print(f"get_data_state >>> {response.status_code}")
        if response.status_code == 200:
            response = response.json()
            if not response.get('error', None):
                return pasrse_json(response)
            else:
                # State not found by its UF code; fall back to searching by state name.
                print("estado não encontrado pela UF. Tentando buscar pelo nome do estado.")
                response = requests.get(ApiBrazilState.URL_STATES.value, timeout=2)
                response = pasrse_json(response.json()).get('data')
                name = strip_accents(name).lower()
                state = [
                    row for row in response
                    if strip_accents(row['state']).lower() == name
                ]
                if state:
                    return state.pop()
    except Exception as exc:
        print(f"erro na requisição: {exc}")
    return None
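# NOTE: every snippet in this collection relies on a strip_accents helper defined
# elsewhere in its own project. As a point of reference only, a minimal sketch of a
# typical implementation using unicodedata (an assumption; the projects' versions
# may differ in detail):
import unicodedata


def strip_accents_sketch(text):
    """Return `text` with combining accent marks removed, e.g. 'São' -> 'Sao'."""
    # Decompose characters (NFKD), then drop the combining marks.
    decomposed = unicodedata.normalize('NFKD', text)
    return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))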
def gen_filename(record):
    """ Guess the expected filename from the record.

    Args:
        record (dict): a record of the bibtex entry.

    Returns:
        A string which corresponds to the guessed filename (expected to be a pdf).
    """
    record_copy = record.copy()
    record_copy = bibtexparser.customization.author(record_copy)

    # Retrieve a stripped down last name for each author
    last_names = []
    for author in record_copy['author']:
        stripped = utils.strip_accents(codecs.decode(author, "ulatex"))
        name = re.sub('([\\{\\}])', '', stripped.split(',')[0])
        name = re.sub('~', ' ', name)
        name = re.sub("\\\\'ı", "i", name)
        name = re.sub("\\\\`ı", "i", name)
        name = re.sub("ı", "i", name)
        name = re.sub('\xf8', 'o', name)
        name = re.sub('\\\\textquotesingle ', "'", name)
        name = name.replace('ł', 'l')
        last_names.append(name)

    # If there are more than 4 authors, use the 'et al.' form
    if len(last_names) > 4:
        prefix = '(' + last_names[0] + ' et al.) '
    else:
        prefix = '(' + ', '.join(last_names) + ') '

    title = utils.get_title(record_copy)
    title = title.replace('$\\Lambda_{\\infty}$ ', 'λ∞')
    title = re.sub('\\\\textendash ', '- ', title)
    title = utils.strip_accents(codecs.decode(title, "ulatex"))
    title = re.sub('([\\{\\}])', '', title)
    title = re.sub(' *: ', ' - ', title)
    title = re.sub(' *— *', ' - ', title)
    title = re.sub('–', '-', title)
    title = re.sub('/', '-', title)
    # title = re.sub('\\$\\mathplus \\$', '+', title)
    title = re.sub('\\\\textquotesingle ', "'", title)
    title = to_titlecase(title)
    title = re.sub('"', '', title)
    title = re.sub('’', "'", title)
    title = re.sub('\u2010', '-', title)
    title = re.sub('\u2122', '', title)
    title = title.replace('$\\texttt FreeFem++$', 'FreeFem++')
    title = title.replace('$\\lambda _\\Infty $ ', 'λ∞')

    return prefix + title + '.pdf'
def get_data_city(name: str):
    try:
        response = requests.get(ApiBrazilCity.URL_CITY.value, timeout=2)
        print(f"get_data_city >>> {response.status_code}")
        if response.status_code == 200:
            name = name.lower()
            data = response.json().get('docs')
            data_city = []
            for item in data:
                city_name = strip_accents(item["city_name"]).lower()
                if city_name == name:
                    data_city.append(item)
            data_city.sort(key=lambda x: x["date"])
            print(f"get_data_city >>> {city_name}, {name}")
            return data_city.pop() if len(data_city) else None
    except Exception as exc:
        print(f"erro na requisição: {exc}")
    return None
def update_video(vidobj, statusid, filename=None):
    vidobj.statusid = statusid
    vidobj.lastupdated = datetime.now()
    if filename is not None:
        filename = utils.strip_accents(filename)
        filename = utils.clean_special_chars(filename)
        vidobj.filename = filename
    vidobj.save()
def answer(self, question):
    pred_relation = www2fb(get_relation(question, self.questions, self.model, self.index2rel, self.args))
    query_tokens = get_query_text(question, self.questions, self.ent_model, self.index2tag, self.args)

    N = min(len(query_tokens), 3)
    C = []  # candidate entities
    for n in range(N, 0, -1):
        ngrams_set = find_ngrams(query_tokens, n)
        for ngram_tuple in ngrams_set:
            ngram = " ".join(ngram_tuple)
            ngram = strip_accents(ngram)
            # unigram stopwords have too many candidates so just skip over
            if ngram in stopwords:
                continue
            ## PROBLEM! - ngram doesnt exist in index - at test-2592 - KeyError: 'p.a.r.c.e. parce'
            try:
                cand_mids = self.index_ent[ngram]  # search entities
            except KeyError:
                continue
            C.extend(cand_mids)
        if (len(C) > 0):
            break
        break

    C_pruned = []
    for mid in set(C):
        if mid in self.index_reach.keys():  # PROBLEM: don't know why this may not exist??
            count_mid = C.count(mid)  # count number of times mid appeared in C
            C_pruned.append((mid, count_mid))
            if pred_relation in self.index_reach[mid]:
                count_mid = C.count(mid)  # count number of times mid appeared in C
                C_pruned.append((mid, count_mid))

    num_entities_fbsubset = 1959820  # 2M - 1959820 , 5M - 1972702
    C_tfidf_pruned = []
    for mid, count_mid in C_pruned:
        if mid in self.index_names.keys():
            cand_ent_name = pick_best_name(question, self.index_names[mid])
            tfidf = calc_tf_idf(query_tokens, cand_ent_name, count_mid, num_entities_fbsubset, self.index_ent)
            C_tfidf_pruned.append((mid, cand_ent_name, tfidf))

    C_tfidf_pruned.sort(key=lambda t: -t[2])

    # Guard against the case where no candidate entity survives pruning.
    if len(C_tfidf_pruned) == 0:
        return "UNKNOWN"

    pred_ent, name_ent, score = C_tfidf_pruned[0]
    key = (pred_ent, pred_relation)
    if key not in self.fb_graph:
        return "UNKNOWN"
    result_mid = self.fb_graph[key]
    result_mid = list(result_mid)
    result = get_names(self.fb_graph, result_mid)[0]
    return result
def gen_bibkey(record, all_keys):
    """ Generate a unique bibtex key for the given record.

    Args:
        record (dict): a record of the bibtex entry.
        all_keys (set): a set of existing bibtex keys in the current context.

    Returns:
        A string which corresponds to the newly generated unique bibtex key.
        The argument 'all_keys' is also appended with the new key.
    """
    for field in ['year', 'title', 'author']:
        if field not in record:
            record_str = json.dumps(record, sort_keys=True, indent=4, separators=(',', ': '))
            raise ValueError("Missing field '{0}' in bibtex entry:\n{1}".format(field, record_str))

    record_copy = record.copy()
    record_copy = bibtexparser.customization.author(record_copy)

    # Retrieve a stripped down last name of the first author
    first_author = record_copy['author'][0]
    stripped = utils.strip_accents(codecs.decode(first_author, "ulatex"))
    last_name = stripped.split(',')[0]
    last_name = last_name.replace('ø', 'o')
    last_name = last_name.replace('ł', 'l')
    last_name = re.sub('([^a-zA-Z])', '', last_name)

    # Then get the first 3 initials of the article title
    curated_title = re.sub('([^a-zA-Z])', ' ', utils.get_title(record_copy))
    short_title = ''.join(s[0] for s in curated_title.split())
    short_title += curated_title.split()[-1][1:]
    short_title = short_title[:3].upper()

    # Key is Author:Year:Initials
    basekey = last_name + ":" + record_copy['year'] + ":" + short_title
    bibkey = basekey

    # Assign a unique key
    tail = 'a'
    while bibkey in all_keys:
        bibkey = basekey + tail
        tail = chr(ord(tail) + 1)

    all_keys.add(bibkey)
    return bibkey
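# A hypothetical usage sketch for gen_bibkey. It assumes the surrounding module's
# imports are available (bibtexparser, latexcodec for the "ulatex" codec, json, re,
# codecs, and the local utils helpers); the record values below are made up.
record = {
    'ENTRYTYPE': 'article',
    'ID': 'temp',
    'author': 'Doe, John and Smith, Jane',
    'title': 'A Study of Accent Stripping',
    'year': '2020',
}
existing_keys = set()
key = gen_bibkey(record, existing_keys)
# The key follows the Author:Year:Initials pattern, e.g. roughly "Doe:2020:ASO";
# a lowercase letter is appended when the base key already exists in existing_keys.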
def movie_query(self, title, year, single_query=False, caller_name=None):
    title = strip_accents(title)

    if self.caller_name is None:
        if caller_name is None:
            caller_name = get_caller_name()
        self.caller_name = caller_name

    self.title = source_utils.clean_title(title)
    self.year = year

    full_query = '%s %s' % (title, year)
    use_cache_only = self._get_cache(full_query)
    if use_cache_only:
        return self._get_movie_results()

    skip_set_cache = False
    try:
        self._url = self._find_url()
        if self._url is None:
            self._set_cache(full_query)
            return self._get_movie_results()

        movie = lambda query: self._query_thread(query, [self.filter_movie_title])
        queries = [movie(self.title + ' ' + self.year)]
        try:
            alternative_title = replace_text_with_int(self.title)
            if self.title != alternative_title:
                queries.append(movie(alternative_title + ' ' + self.year))
        except:
            pass

        wait_threads(queries)

        if len(self._temp_results) == 0 and not single_query and not self._request.has_timeout_exc:
            self._set_cache(full_query)
            skip_set_cache = True
            wait_threads([movie(self.title)])

        if not skip_set_cache:
            self._set_cache(full_query)

        return self._get_movie_results()
    except:
        if not skip_set_cache:
            self._set_cache(full_query)
        return self._get_movie_results()
def format_msg_state(self, data, name):
    result = Messages.NOT_FOUND_STATE.value
    flag = None
    if isinstance(data, dict):
        state = ParseDictAsObj(data)
        name = strip_accents(state.state).lower().replace(' ', '-')
        flag = download_img(
            UrlFlag.URL_FLAG_STATE.value.format(
                state=name if name != 'sao-paulo' else name + '1'))
        # Portuguese status message: UF, state, confirmed, suspected, discarded, deaths.
        result = (f"UF: {state.uf}\n"
                  f"Estado: {state.state}\n"
                  f"Confirmados: {state.cases}\n"
                  f"Suspeitos: {state.suspects}\n"
                  f"Casos descartados: {state.refuses}\n"
                  f"Mortes: {state.deaths}")
    return result, flag
def generate_documents():
    events = r.keys('event:*:title')
    for event_key in events:
        event_id = id(event_key)
        lang = r.get('event:' + event_id + ':lang')
        docs = r.keys('document:*:' + event_id)
        documents[event_id] = []
        for doc_key in docs:
            doc_id = id(doc_key)
            tweet_ids = r.lrange('document:' + doc_id + ':tweets', 0, -1)
            document = []
            for tweet_id in tweet_ids:
                # this could be improved...
                tweet = utils.remove_entities(tweet_id)
                tweet = parser.unescape(' '.join(tweet.split()))
                if len(tweet) == 0 or len(tweet.split()) == 0:
                    continue
                tweet = utils.strip_accents(tweet)
                tweet = utils.remove_stopwords(tweet, lang)
                tweet = ' '.join([stemmers[lang].stem(token) for token in tweet.split()])
                document.append(tweet)
            documents[event_id].append(' '.join(document))
def generate_documents_for(event_id):
    lang = r.get('event:' + event_id + ':lang')
    if lang is None:
        lang = 'spanish'
    docs = r.keys('document:*:' + event_id)
    documents[event_id] = []
    documents_ids[event_id] = []
    keys = []
    for eid in docs:
        keys.append(id(eid))
    docs = set(keys)
    for doc_id in docs:
        # doc_id = id(doc_key)
        # Facebook URLs could not be resolved and many documents end up pointing to
        # "unsupportedbrowser", so Facebook is ignored until this problem is fixed.
        url = r.get('document:%s:url' % doc_id)
        if urlparse(url).netloc == 'www.facebook.com':
            continue
        documents_real_ids.append(doc_id)
        tweet_ids = r.lrange('document:' + doc_id + ':tweets', 0, -1)
        documents_ids[event_id].append(tweet_ids)
        document = []
        for tweet_id in tweet_ids:
            # this could be improved...
            tweet = utils.remove_entities(tweet_id)
            tweet = parser.unescape(' '.join(tweet.split()))
            if len(tweet) == 0 or len(tweet.split()) == 0:
                continue
            tweet = utils.strip_accents(tweet)
            tweet = utils.remove_stopwords(tweet, lang)
            tweet = ' '.join([stemmers[lang].stem(token) for token in tweet.split()])
            document.append(tweet)
        documents[event_id].append(' '.join(document))
def get_search_terms_news(redis, news_id, lang):
    # Get all pages that are children of the event id=news_id
    # and that have not been processed before*.
    keys = redis.keys('page:*:news_%s' % news_id)
    terms = []
    for key in keys:
        id = key.split(':')[1]
        got = redis.get('page:%s:searched' % id)
        # Allow searching tweets for a page of an event up to 2 times.
        if got is None or got < 2:
            title = redis.get('page:%s:title' % id)
            title = title.decode('utf-8', errors='ignore')
            title = h.unescape(title)
            title = utils.strip_accents(title)
            title = utils.remove_stopwords(title, lang=lang)
            terms.append(title)
            redis.incr('page:%s:searched' % id)
    print tag, 'got', len(terms), 'search terms for news'
    return terms
def postprocessing_td012(td012):
    table = td012.copy()
    is_rpn = table.rpn > 0
    is_rpint = table.rpint > 0
    is_chaudiere = is_rpint | is_rpn
    is_chaudiere = is_chaudiere | ~table.tv038_puissance_nominale_id.isnull()

    # all text description raw concat
    gen_ch_concat_txt_desc = table['tv031_type_generateur'].astype('string').replace(np.nan, '') + ' '
    gen_ch_concat_txt_desc.loc[is_chaudiere] += 'chaudiere '
    gen_ch_concat_txt_desc += table['tv036_type_chaudiere'].astype('string').replace(np.nan, ' ') + ' '
    gen_ch_concat_txt_desc += table["tv030_type_installation"].astype('string').replace(np.nan, ' ') + ' '
    gen_ch_concat_txt_desc += table["tv032_type_generateur"].astype('string').replace(np.nan, ' ') + ' '
    gen_ch_concat_txt_desc += table['tv035_type_chaudiere'].astype('string').replace(np.nan, ' ') + ' '
    gen_ch_concat_txt_desc += table['tv036_type_generation'].astype('string').replace(np.nan, ' ') + ' '
    gen_ch_concat_txt_desc += table["tv030_type_installation"].astype('string').replace(np.nan, ' ') + ' '
    gen_ch_concat_txt_desc += table["tr004_description"].astype('string').replace(np.nan, ' ') + ' '
    gen_ch_concat_txt_desc += table["tv045_energie"].astype('string').replace(np.nan, ' ') + ' '
    gen_ch_concat_txt_desc += table['tv046_nom_reseau'].isnull().replace({
        False: 'réseau de chaleur',
        True: ""
    })
    gen_ch_concat_txt_desc = gen_ch_concat_txt_desc.str.lower().apply(lambda x: strip_accents(x))
    table['gen_ch_concat_txt_desc'] = gen_ch_concat_txt_desc
    table['gen_ch_concat_txt_desc'] = table['gen_ch_concat_txt_desc'].apply(lambda x: clean_str(x))

    # infer gen_ch_lib_infer by text matching score
    unique_gen_ch = table.gen_ch_concat_txt_desc.unique()
    gen_ch_lib_infer_dict = {
        k: affect_lib_by_matching_score(k, gen_ch_normalized_lib_matching_dict)
        for k in unique_gen_ch
    }
    table['gen_ch_lib_infer'] = table.gen_ch_concat_txt_desc.replace(gen_ch_lib_infer_dict)

    # compute the heating energy type
    table['type_energie_chauffage'] = table['tv045_energie'].replace(replace_elec_tv045_ener)

    # recover/fix heat pumps (PAC)
    is_pac = (table.coefficient_performance > 2) | (table.rendement_generation > 2)
    table.loc[is_pac, 'gen_ch_lib_infer'] = table.loc[is_pac, 'coefficient_performance'].replace(pac_dict)
    is_ind = is_pac & (~table.loc[is_pac, 'gen_ch_lib_infer'].isin(pac_dict.values()))
    table.loc[is_pac, 'gen_ch_lib_infer'] = table.loc[is_pac, 'rendement_generation'].replace(pac_dict)
    is_ind = is_pac & (~table.loc[is_pac, 'gen_ch_lib_infer'].isin(pac_dict.values()))
    table.loc[is_ind, 'gen_ch_lib_infer'] = 'pac indeterminee'

    # recover/fix wood stoves
    is_bois = table.gen_ch_concat_txt_desc == 'bois, biomasse bois, biomasse'
    table.loc[is_bois, 'gen_ch_lib_infer'] = table.loc[is_bois, 'rendement_generation'].replace(poele_dict)
    is_ind = is_bois & (~table.loc[is_bois, 'gen_ch_lib_infer'].isin(poele_dict.values()))
    table.loc[is_ind, 'gen_ch_lib_infer'] = 'non affecte'

    # recover district heating networks
    non_aff = table.gen_ch_lib_infer == 'non affecte'
    reseau_infer = non_aff & (table.rendement_generation == 0.97) & (table.tr004_description == 'Autres énergies')
    table.loc[reseau_infer, 'gen_ch_lib_infer'] = 'reseau de chaleur'
    table.loc[reseau_infer, 'type_energie_chauffage'] = 'Réseau de chaleurs'

    table['gen_ch_lib_infer_simp'] = table.gen_ch_lib_infer.replace(gen_ch_lib_simp_dict)

    # fix electric boilers
    bool_ej = table.gen_ch_lib_infer == 'autres emetteurs a effet joule'
    bool_ce = table.rendement_generation == 0.77
    table.loc[(bool_ej) & (bool_ce), 'gen_ch_lib_infer'] = 'chaudiere electrique'

    rendement_gen_u = table[['rendement_generation', 'coefficient_performance']].max(axis=1)
    s_rendement = pd.Series(index=table.index)
    s_rendement[:] = 1
    for rendement in [
            'rendement_distribution_systeme_chauffage',
            'rendement_emission_systeme_chauffage'
    ]:
        r = table[rendement].astype(float)
        r[r == 0] = 1
        r[r.isnull()] = 1
        s_rendement = s_rendement * r
    rendement_gen_u[rendement_gen_u == 0] = 1
    rendement_gen_u[rendement_gen_u.isnull()] = 1
    s_rendement = s_rendement * rendement_gen_u

    table['besoin_chauffage_infer'] = table['consommation_chauffage'] * s_rendement

    return table
def sources(self, simple_info, hostDict, hostprDict):
    if simple_info is None:
        return []

    supported_hosts = hostDict + hostprDict
    sources = []

    try:
        query_type = None
        if simple_info.get('title', None) is not None:
            query_type = 'movie'
            query = '%s %s' % (strip_accents(simple_info['title']), simple_info['year'])
        else:
            query_type = 'episode'
            query = '%s S%sE%s' % (strip_accents(simple_info['show_title']),
                                   simple_info['season_number_xx'],
                                   simple_info['episode_number_xx'])

        if len(supported_hosts) > 0:
            url = self.scraper._find_url()

            def search(url):
                try:
                    result = self.search(url, query)
                    if result is None:
                        raise requests.exceptions.RequestException()
                    return result
                except requests.exceptions.RequestException:
                    url = self.scraper._find_next_url(url)
                    if url is None:
                        return []
                    return search(url)

            hoster_results = search(url) if url is not None else []
        else:
            hoster_results = []

        for result in hoster_results:
            quality = source_utils.get_quality(result.title)

            if query_type == 'movie' and not source_utils.filter_movie_title(
                    result.title, simple_info['title'], simple_info['year']):
                continue

            if query_type == 'episode' and not source_utils.filter_single_episode(
                    simple_info, result.title):
                continue

            for url in result.urls:
                domain = re.findall(r"https?:\/\/(www\.)?(.*?)\/.*?", url)[0][1]
                if domain not in supported_hosts:
                    continue
                if any(x in url for x in ['.rar', '.zip', '.iso']):
                    continue

                quality_from_url = source_utils.get_quality(url)
                if quality_from_url != 'SD':
                    quality = quality_from_url

                sources.append({
                    'release_title': strip_non_ascii_and_unprintable(result.title),
                    'source': domain,
                    'quality': quality,
                    'language': 'en',
                    'url': url,
                    'info': [],
                    'direct': False,
                    'debridonly': False
                })

        sources.reverse()

        result_count = len(sources) if len(supported_hosts) > 0 else 'disabled'
        tools.log('a4kScrapers.%s.%s: %s' % (query_type, self._caller_name, result_count), 'notice')

        return sources
    except:
        traceback.print_exc()
        return sources
bert_reader = BertReader(args)
ansrini_searcher = build_searcher(args.k1, args.b, args.index_path, args.rm3, chinese=args.chinese)

count_hit = [0] * (args.para_num)
count_total = [0] * (args.para_num)
all_results = []
for question_id in trange(len(QAs)):
    start_time = time.time()
    question = strip_accents(QAs[question_id]['question'])  # strip accents from Latin characters
    if args.chinese:
        if args.toSimplified:
            question = HanziConv.toSimplified(question)
        paragraphs = anserini_retriever(question, ansrini_searcher, args.para_num)
    else:
        paragraphs = anserini_retriever(question, ansrini_searcher, args.para_num)

    if len(paragraphs) == 0:
        continue

    paragraph_texts = []
    paragraph_scores = []
    hit_flag = False
    for paragraph_id, paragraph in enumerate(paragraphs):
        paragraph_texts.append(paragraph['text'])
def get_disc_info(self):
    """
        Returns information about the selected disc

        Inputs:
            None

        Outputs:
            None
    """
    proc = subprocess.Popen(
        [
            '%smakemkvcon' % self.makemkvconPath,
            '-r',
            'info',
            'dev:/dev/sr0',
            '--decrypt',
            '--minlength=%d' % self.minLength,
            '--messages=/tmp/makemkvMessages'
        ],
        stderr=subprocess.PIPE
    )
    (results, errors) = proc.communicate()

    if proc.returncode != 0:
        self.log.error(
            "MakeMKV (get_disc_info) returned status code: %d" % proc.returncode)

    if errors is not None:
        if len(errors) != 0:
            self.log.error("MakeMKV encountered the following error: ")
            self.log.error(errors)
            return False

    foundtitles = int(self._read_mkv_messages("TCOUNT")[0])
    self.log.debug("MakeMKV found {} titles".format(foundtitles))

    if foundtitles > 0:
        for titleNo in set(self._read_mkv_messages("TINFO")):
            durTemp = self._read_mkv_messages("TINFO", titleNo, 9)[0]
            x = time.strptime(durTemp, '%H:%M:%S')
            titleDur = datetime.timedelta(
                hours=x.tm_hour,
                minutes=x.tm_min,
                seconds=x.tm_sec
            ).total_seconds()

            if self.vidType == "tv" and titleDur > self.maxLength:
                self.log.debug("Excluding Title No.: {}, Title: {}. Exceeds maxLength".format(
                    titleNo, self._read_mkv_messages("TINFO", titleNo, 27)
                ))
                continue

            if self.vidType == "movie" and not re.search('00', self._read_mkv_messages("TINFO", titleNo, 27)[0]):
                self.log.debug("Excluding Title No.: {}, Title: {}. Only want first title".format(
                    titleNo, self._read_mkv_messages("TINFO", titleNo, 27)
                ))
                continue

            self.log.debug("MakeMKV title info: Disc Title: {}, Title No.: {}, Title: {}, ".format(
                self._read_mkv_messages("CINFO", 2),
                titleNo,
                self._read_mkv_messages("TINFO", titleNo, 27)
            ))

            title = self._read_mkv_messages("TINFO", titleNo, 27)[0]
            rename_title = utils.strip_accents(title)
            rename_title = utils.clean_special_chars(rename_title)

            self.saveFiles.append({
                'index': titleNo,
                'title': title,
                'rename_title': rename_title,
            })
def episode_query(self, simple_info, auto_query=True, single_query=False, caller_name=None, exact_pack=False):
    simple_info['show_title'] = strip_accents(simple_info['show_title'])

    if self.caller_name is None:
        if caller_name is None:
            caller_name = get_caller_name()
        self.caller_name = caller_name

    simple_info['show_aliases'] = list(set(simple_info['show_aliases']))
    if '.' in simple_info['show_title']:
        no_dot_show_title = simple_info['show_title'].replace('.', '')
        simple_info['show_aliases'].append(no_dot_show_title)

    for alias in simple_info['show_aliases']:
        if '.' in alias:
            simple_info['show_aliases'].append(alias.replace('.', ''))

    self.simple_info = simple_info
    self.year = simple_info['year']
    self.country = simple_info['country']
    self.show_title = source_utils.clean_title(simple_info['show_title'])
    if self.year in self.show_title:
        self.show_title_fallback = re.sub(r'\s+', ' ', self.show_title.replace(self.year, ''))
    else:
        self.show_title_fallback = None

    self.episode_title = source_utils.clean_title(simple_info['episode_title'])
    self.season_x = simple_info['season_number']
    self.episode_x = simple_info['episode_number']
    self.season_xx = self.season_x.zfill(2)
    self.episode_xx = self.episode_x.zfill(2)

    #full_query = '%s %s %s %s %s' % (self.show_title, self.year, self.season_xx, self.episode_xx, self.episode_title)
    # use_cache_only = self._get_cache(full_query)
    # if use_cache_only:
    #     return self._get_episode_results()

    try:
        self._url = self._find_url()
        if self._url is None:
            #self._set_cache(full_query)
            return self._get_episode_results()

        if auto_query is False:
            wait_threads([self._episode('')])
            #self._set_cache(full_query)
            return self._get_episode_results()

        def query_results():
            if DEV_MODE:
                if self.caller_name != 'eztv':
                    wait_threads([self._season(self.show_title + ' S%s' % self.season_xx)])
                else:
                    wait_threads([self._episode(self.show_title + ' S%sE%s' % (self.season_xx, self.episode_xx))])
                return

            # specials
            if self.season_x == '0':
                wait_threads([self._episode_special(self.show_title + ' %s' % self.episode_title)])
                #self._set_cache(full_query)
                return

            queries = [self._episode(self.show_title + ' S%sE%s' % (self.season_xx, self.episode_xx))]

            if single_query:
                #self._set_cache(full_query)
                wait_threads(queries)
                return

            if exact_pack:
                queries = queries + [self._season_and_pack(self.show_title + '.S%s.' % self.season_xx)]
            else:
                queries = queries + [
                    self._season(self.show_title + ' Season ' + self.season_x),
                    self._season(self.show_title + ' S%s' % self.season_xx),
                    self._pack(self.show_title + ' Seasons'),
                    self._season_and_pack(self.show_title + ' Complete')
                ]

            if simple_info.get('isanime', False) and simple_info.get('absolute_number', None) is not None:
                queries.insert(0, self._episode(self.show_title + ' %s' % simple_info['absolute_number']))

            if self._use_thread_for_info:
                wait_threads([queries[0]])
            else:
                wait_threads(queries)

        query_results()

        if len(self._temp_results) == 0 and self.show_title_fallback is not None:
            self.show_title = self.show_title_fallback
            self.simple_info['show_title'] = self.show_title_fallback
            query_results()

        #self._set_cache(full_query)
        return self._get_episode_results()
    except:
        #self._set_cache(full_query)
        return self._get_episode_results()
def postprocessing_ecs_ft(td005):
    td005_ecs = td005.loc[td005.tr011_sous_categorie_fiche_technique_id == '17']
    vr_ecs = td005_ecs.valeur_renseignee.str.lower().apply(lambda x: strip_accents(x))
    sys_ecs_lib_infer_ft = vr_ecs.apply(
        lambda x: affect_lib_by_matching_score(x, gen_ecs_normalized_lib_matching_dict_ft))
def lookup_muni(name_muni=None, code_muni=None, verbose=False):
    """ Lookup municipality codes and names.

    Input a municipality NAME or CODE and get the names and codes of the
    municipality's corresponding state, meso, micro, intermediate, and
    immediate regions. You should not select both code_muni and name_muni.

    Parameters
    ----------
    name_muni : str, optional
        The municipality name to be looked up
    code_muni : str, optional
        The municipality code to be looked up
    verbose : bool, optional
        by default False

    Returns
    -------
    DataFrame with 13 columns identifying the geographies information of
    that municipality

    Details
    -------
    Only available from 2010 Census data so far

    Raises
    ------
    Exception if code_muni or name_muni cannot be found

    Example
    -------
    >>> import geobr

    # Lookup table for municipality of Rio de Janeiro
    >>> mun = lookup_muni('Rio de Janeiro')
    or
    >>> mun = lookup_muni(3304557)

    # lookup table for all municipalities
    >>> mun_all = lookup_muni("all")
    """
    # Get metadata with data url addresses
    temp_meta = utils.select_metadata(geo='lookup_muni', year=2010)

    # Read DataFrame available at provided url
    lookup_table_2010 = utils.download_metadata(temp_meta.loc[:, 'download_path'].to_list()[0])
    lookup_table_2010['name_muni_format'] = lookup_table_2010['name_muni_format'].str.lower()

    # Search by inputs
    if code_muni == 'all' or name_muni == 'all' or (code_muni is None and name_muni is None):
        if verbose:
            print("Returning results for all municipalities")
        return lookup_table_2010.iloc[:, :-1]

    elif code_muni is not None:
        if name_muni is not None:
            if verbose:
                print("Ignoring argument name_muni")
        try:
            output = lookup_table_2010[lookup_table_2010['code_muni'] == int(code_muni)].iloc[:, :-1]
            if verbose:
                print(f"Returning results for municipality {output.loc[:, 'name_muni'].to_list()[0]}")
            return output
        except KeyError:
            raise Exception(
                f'The `code_muni` argument {code_muni} was not found in the database.')

    elif name_muni is not None:
        # Strip accents and convert to lower case without surrounding spaces
        name_muni = utils.strip_accents(str(name_muni).lower().strip())
        output = lookup_table_2010[lookup_table_2010['name_muni_format'] == name_muni]

        if len(output) == 0:
            if verbose:
                print("Please insert a valid municipality name")
            raise Exception(
                f'The `name_muni` argument {name_muni} was not found in the database.')
        else:
            if verbose:
                print(f"Returning results for municipality {output.loc[:, 'name_muni'].to_list()[0]}")
            return output.iloc[:, :-1]

    elif code_muni == 'all' and name_muni == 'all':
        if verbose:
            print("Please insert either a municipality name or a municipality code")
def get_disc_info(self):
    """
        Returns information about the selected disc

        Inputs:
            None

        Outputs:
            None
    """
    proc = subprocess.Popen(
        [
            '%smakemkvcon' % self.makemkvconPath,
            '-r',
            'info',
            'disc:%d' % self.discIndex,
            '--decrypt',
            '--minlength=%d' % self.minLength,
            '--messages=/tmp/makemkvMessages'
        ],
        stderr=subprocess.PIPE
    )
    (results, errors) = proc.communicate()

    if proc.returncode != 0:
        self.log.error(
            "MakeMKV (get_disc_info) returned status code: %d" % proc.returncode)

    if errors is not None:
        if len(errors) != 0:
            self.log.error("MakeMKV encountered the following error: ")
            self.log.error(errors)
            return False

    foundtitles = int(self._read_mkv_messages("TCOUNT")[0])
    self.log.debug("MakeMKV found {} titles".format(foundtitles))

    if foundtitles > 0:
        for titleNo in set(self._read_mkv_messages("TINFO")):
            durTemp = self._read_mkv_messages("TINFO", titleNo, 9)[0]
            x = time.strptime(durTemp, '%H:%M:%S')
            titleDur = datetime.timedelta(
                hours=x.tm_hour,
                minutes=x.tm_min,
                seconds=x.tm_sec
            ).total_seconds()

            if self.vidType == "tv" and titleDur > self.maxLength:
                self.log.debug("Excluding Title No.: {}, Title: {}. Exceeds maxLength".format(
                    titleNo, self._read_mkv_messages("TINFO", titleNo, 27)
                ))
                continue

            if self.vidType == "movie" and not re.search('00', self._read_mkv_messages("TINFO", titleNo, 27)[0]):
                self.log.debug("Excluding Title No.: {}, Title: {}. Only want first title".format(
                    titleNo, self._read_mkv_messages("TINFO", titleNo, 27)
                ))
                continue

            self.log.debug("MakeMKV title info: Disc Title: {}, Title No.: {}, Title: {}, ".format(
                self._read_mkv_messages("CINFO", 2),
                titleNo,
                self._read_mkv_messages("TINFO", titleNo, 27)
            ))

            title = self._read_mkv_messages("TINFO", titleNo, 27)[0]
            rename_title = utils.strip_accents(title)
            rename_title = utils.clean_special_chars(rename_title)

            self.saveFiles.append({
                'index': titleNo,
                'title': title,
                'rename_title': rename_title,
            })
def process_content(real_content, lang):
    real_content = utils.strip_accents(real_content)
    real_content = utils.remove_stopwords(real_content, lang)
    real_content = utils.stem(real_content, lang)
    return real_content
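# The utils helpers used above are not shown. A rough, self-contained equivalent of
# this normalize -> remove stopwords -> stem pipeline, written with unicodedata and
# NLTK, is sketched below (an assumption about what the helpers do, not their actual
# implementation; requires `nltk.download('stopwords')`).
import unicodedata

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer


def process_content_sketch(text, lang='spanish'):
    # Strip accents via Unicode decomposition.
    text = ''.join(c for c in unicodedata.normalize('NFKD', text)
                   if not unicodedata.combining(c))
    # Drop stopwords for the given language.
    stop = set(stopwords.words(lang))
    tokens = [t for t in text.lower().split() if t not in stop]
    # Stem the remaining tokens.
    stemmer = SnowballStemmer(lang)
    return ' '.join(stemmer.stem(t) for t in tokens)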
def cleanText(self, text):
    return utils.getTokensNoUserNoHashtag(utils.strip_accents(text))
import json

from tqdm import trange, tqdm

from bert_reader import BertReader
from args import *
from utils import strip_accents

if __name__ == "__main__":
    QAs = convert_squad_to_list("./data/squad_v1.1/dev-v1.1.json")
    bert_reader = BertReader(args)
    all_results = []
    for question_id in trange(len(QAs)):
        question = strip_accents(QAs[question_id]["question"])
        paragraph_texts = [QAs[question_id]["context"]]
        id_ = QAs[question_id]["id"]
        paragraph_scores = [100]
        final_answers = bert_reader.predict(id_, question, paragraph_texts, paragraph_scores)
        print(question, final_answers)
        all_results.append(final_answers)
    json.dump(all_results, open("pytorch_bert_squad.json", 'w'))
def postprocessing_td014(td013, td014):
    table = td014.copy()
    table = table.merge(td013[[
        'tr005_description', 'td013_installation_ecs_id',
        'surface_habitable_echantillon'
    ]], on='td013_installation_ecs_id')

    is_chaudiere = table.rpn > 0

    gen_ecs_concat_txt_desc = table["tv027_type_installation"].astype('string').replace(np.nan, '') + ' '
    gen_ecs_concat_txt_desc.loc[is_chaudiere] += 'chaudiere '
    gen_ecs_concat_txt_desc += table['tv027_type_systeme'].astype('string').replace(np.nan, '') + ' '
    gen_ecs_concat_txt_desc += table['tv027_type_installation'].astype('string').replace(np.nan, '') + ' '
    gen_ecs_concat_txt_desc += table["tv032_type_generateur"].astype('string').replace(np.nan, '') + ' '
    gen_ecs_concat_txt_desc += table['tv036_type_generation'].astype('string').replace(np.nan, '') + ' '
    gen_ecs_concat_txt_desc += table['tv037_type_production'].astype('string').replace(np.nan, '') + ' '
    gen_ecs_concat_txt_desc += table['tv040_type_generateur'].astype('string').replace(np.nan, '') + ' '
    gen_ecs_concat_txt_desc += table["tv040_type_installation"].astype('string').replace(np.nan, '') + ' '
    gen_ecs_concat_txt_desc += table["tr004_description"].astype('string').replace(np.nan, '') + ' '
    gen_ecs_concat_txt_desc += table["tv045_energie"].astype('string').replace(np.nan, '') + ' '
    gen_ecs_concat_txt_desc += table['tv047_type_generateur'].astype('string').replace(np.nan, '') + ' '
    gen_ecs_concat_txt_desc += table['tr005_description'].astype('string').replace(np.nan, '') + ' '
    gen_ecs_concat_txt_desc = gen_ecs_concat_txt_desc.str.lower().apply(lambda x: strip_accents(x))
    table['gen_ecs_concat_txt_desc'] = gen_ecs_concat_txt_desc
    table['gen_ecs_concat_txt_desc'] = table['gen_ecs_concat_txt_desc'].apply(lambda x: clean_str(x))

    # infer gen_ecs_lib_infer by text matching score
    unique_gen_ecs = table.gen_ecs_concat_txt_desc.unique()
    gen_ecs_lib_infer_dict = {
        k: affect_lib_by_matching_score(k, gen_ecs_normalized_lib_matching_dict)
        for k in unique_gen_ecs
    }
    table['gen_ecs_lib_infer'] = table.gen_ecs_concat_txt_desc.replace(gen_ecs_lib_infer_dict)

    is_pac = table.coefficient_performance > 2
    table.loc[is_pac, 'gen_ecs_lib_infer'] = "ECS thermodynamique electrique(PAC ou ballon)"

    ecs_ind = table.gen_ecs_lib_infer == 'ecs electrique indeterminee'
    stockage = table.volume_stockage > 20
    table.loc[ecs_ind & stockage, 'gen_ecs_lib_infer'] = 'ballon a accumulation electrique'
    table.loc[ecs_ind & (~stockage), 'gen_ecs_lib_infer'] = 'ballon a accumulation electrique'

    table['gen_ecs_lib_infer_simp'] = table.gen_ecs_lib_infer.replace(gen_ecs_lib_simp_dict)

    # recover fuel-oil (fioul) systems
    non_aff = table['gen_ecs_lib_infer'] == 'non affecte'
    fioul = table['tv045_energie'] == 'Fioul domestique'
    table.loc[fioul & non_aff, 'gen_ecs_lib_infer'] = 'chaudiere fioul'

    table['type_energie_ecs'] = table['tv045_energie'].replace(replace_elec_tv045_ener)
    table['score_gen_ecs_lib_infer'] = table['gen_ecs_lib_infer'].replace(sys_principal_score_lib).astype(float)

    return table