def main():
    #############################################################################
    # 0.
    #
    # Check if tmp folder exists, otherwise create it
    check_create_folder(settings.tmp_dir)

    # Build the list with countries and states
    admin_areas = get_aa_list()

    for chart in settings.charts:
        ind_source = settings.src_auxiliary + str(settings.current_edition) + '-' + str(chart["id"]) + '.csv'

        global_avg = False
        # Calculate the global average for this chart
        if "global_average" in chart and chart["global_average"]:
            global_avg = get_avg(chart, ind_source)

        for aa in admin_areas:
            iso = aa.lower()
            for lang in settings.langs:
                # Initialize the dict that will be written to JSON
                json_data = {
                    "name": iso,
                    "iso": iso,
                    "meta": {
                        "title": chart["title"][lang],
                        "label-x": chart["labelx"][lang],
                        "label-y": chart["labely"][lang]
                    },
                    "data": []
                }

                for serie in chart["series"]:
                    if serie["id"] == 'country':
                        # If we're dealing with a country, use the country name as the label of the serie
                        serie_name = aa
                    else:
                        serie_name = serie["name"][lang]

                    # Initialize the object for the serie
                    serie_to_append = {"name": serie_name, "id": serie["id"], "values": []}

                    # Add a note to the serie
                    if chart["note"]:
                        serie_to_append["note"] = add_note(serie, ind_source, aa)

                    # Generate the actual data
                    serie_to_append["values"] = chart['function'](serie, ind_source, lang, aa, chart["years"], global_avg)

                    json_data["data"].append(serie_to_append)

                # Write the dict to a JSON file
                file_path = (settings.exp_aux_json).format(lang=lang, indicator=chart["export"], aa=iso)
                write_json(file_path, json_data)

    # Fully remove the temp directory
    clean_dir(settings.tmp_dir, True)

    print "All done. The auxiliary data has been prepared for use on global-climatescope.org."
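# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original script: main() above relies on
# a write_json(file_path, data) helper that is defined elsewhere in the
# project. A minimal implementation consistent with how it is called could
# look like the following; the real helper may differ (encoding options,
# indentation, directory creation).
import json

def write_json(file_path, data):
    # Serialize `data` and write it to `file_path`, overwriting any existing file.
    with open(file_path, 'w') as f:
        json.dump(data, f)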
def main():
    #############################################################################
    # 0.
    #
    # Check if tmp folder exists, otherwise create it
    check_create_folder(settings.tmp_dir)

    # Build the list with countries and states
    admin_areas = get_aa_list()

    for chart in settings.charts:
        ind_source = settings.src_auxiliary + str(settings.current_edition) + '-' + str(chart["id"]) + '.csv'

        global_avg = False
        # Calculate the global average for this chart
        if "global_average" in chart and chart["global_average"]:
            global_avg = get_avg(chart, ind_source)

        for aa in admin_areas:
            iso = aa.lower()
            for lang in settings.langs:
                # Initialize the dict that will be written to JSON
                json_data = {
                    "name": iso,
                    "iso": iso,
                    "meta": {
                        "title": chart["title"][lang],
                        "label-x": chart["labelx"][lang],
                        "label-y": chart["labely"][lang]
                    },
                    "data": []
                }

                for serie in chart["series"]:
                    if serie["id"] == 'country':
                        # If we're dealing with a country, use the country name as the label of the serie
                        serie_name = aa
                    else:
                        serie_name = serie["name"][lang]

                    # Initialize the object for the serie
                    serie_to_append = {"name": serie_name, "id": serie["id"], "values": []}

                    # Generate the actual data
                    serie_to_append["values"] = chart['function'](serie, ind_source, lang, aa, chart["years"], global_avg)

                    json_data["data"].append(serie_to_append)

                # Write the dict to a JSON file
                file_path = (settings.exp_aux_json).format(lang=lang, indicator=chart["export"], aa=iso)
                write_json(file_path, json_data)

    # Fully remove the temp directory
    clean_dir(settings.tmp_dir, True)

    print "All done. The auxiliary data has been prepared for use on global-climatescope.org."
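# ---------------------------------------------------------------------------
# Illustrative sketch, not from the original settings module: the two main()
# variants above read a number of keys from each entry in settings.charts. A
# hypothetical entry with exactly those keys might look like this; the id,
# labels, languages and the generator function are made-up placeholders.
example_chart = {
    "id": 101,                           # used to build the source CSV filename
    "export": "installed-capacity",      # used in the exported JSON filename
    "title": {"en": "Installed capacity", "es": "Capacidad instalada"},
    "labelx": {"en": "Year", "es": "Ano"},
    "labely": {"en": "MW", "es": "MW"},
    "years": [2013, 2014, 2015],
    "note": False,                       # when truthy, add_note() is called per serie
    "global_average": True,              # when truthy, get_avg() is called once per chart
    "series": [
        {"id": "country", "name": {"en": "Country", "es": "Pais"}},
        {"id": "global", "name": {"en": "Global average", "es": "Promedio global"}},
    ],
    # Placeholder: the real project plugs in a data-generation function with
    # the signature (serie, ind_source, lang, aa, years, global_avg).
    "function": lambda serie, ind_source, lang, aa, years, global_avg: [],
}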
def run(keyword, title_matching=False):
    per_search = 100
    init_results = search(keyword, per_search, offset=0)
    total = init_results['total']
    total_search = total // per_search
    insert_search_log(keyword, total)

    output_dir = f'{dw_path}/{keyword}'
    make_dir(output_dir)
    keyword_id = get_keyword_id(keyword)
    print(f'{total} models found')

    for i in range(total_search + 1):
        results = search(keyword, per_search, offset=i * per_search)
        for item in tqdm(results['entries']):
            try:
                id = item['id']
                name = filter_escape_char(item['title'])
                if is_model(id):
                    continue
                if title_matching and keyword not in item['title'].lower():
                    continue
                zip_file = download(output_dir, item)
                if not zip_file:
                    continue
                unzipped_dir = unzip_file(zip_file)
                files = filter_files(unzipped_dir)
                for file in files:
                    moved_file = move_file(join(unzipped_dir, file), output_dir)
                    obj_file = convert_to_obj(moved_file)
                    # if 'bot_smontage' in item['binaryNames']:
                    #     image = item['binaries']['bot_smontage']['contentUrl']
                    # else:
                    image = item['binaries']['bot_lt']['contentUrl']
                    insert_dw_file(id, name, image, obj_file, keyword_id)
                shutil.rmtree(unzipped_dir)
            except Exception as e:
                logging.error(f'[{keyword}]:{e}')

    clean_dir(output_dir)
    create_image(output_dir)
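# ---------------------------------------------------------------------------
# Illustrative usage, not part of the original module: run() only needs a
# search keyword; title_matching restricts downloads to results whose title
# contains that keyword. The keyword below is a made-up example and assumes
# dw_path, search(), download() and the other helpers are already configured.
if __name__ == '__main__':
    run('office chair', title_matching=True)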
def main():
    global connection, cursor

    cpu = multiprocessing.cpu_count()
    print("CPU {}".format(cpu))

    # preliminary work
    check_file(args.infile)
    ensure_dir(args.output)

    all_lines = 0
    if args.name_len_update:
        line_cnt = line_counter(args.infile)
        args.name_len = len(str(line_cnt)) + 1
    clean_dir(args.output, args.name_len)
    # end preliminary work

    all_bucked = defaultdict(list)
    p_bucket = defaultdict(list)
    save_idx = 0
    id_name = '{0:0' + str(args.name_len) + 'd}'

    # load tokenizer
    print('Splitting sentence into different clusters ...')
    infile = open(args.infile, 'r', encoding="utf-8")
    i = 0
    all_data = infile.readlines()
    n = 10000  # split the big list into smaller chunks of n lines each
    lstgs = [all_data[i:i + n] for i in range(0, len(all_data), n)]
    print(len(lstgs))

    r = []
    tr = []
    pool = multiprocessing.Pool(processes=4)
    for xyz in lstgs:
        tr.append(pool.apply_async(fenci, (xyz, )))  # fenci (分词) = tokenize; worker defined elsewhere
    pool.close()
    pool.join()

    for res in tr:
        tmp = res.get()
        for z in tmp:
            if z not in jieba_cache.keys():
                jieba_cache[z] = tmp[z]
            else:
                print(z)

    for st in stop_words:
        stop_words_cache[st] = 1

    r.clear()
    r = None

    all_lines = len(jieba_cache)
    print("Starting: {} lines in total".format(all_lines))
    print("jieba cache ready: {}".format(len(jieba_cache)))
    print("stop-word cache ready: {}".format(len(stop_words_cache)))

    all_data = jieba_cache.keys()
    for inline in all_data:
        if inline == '太原去贵阳怎么走':  # leftover debugging hook for a specific sample sentence
            print("")
        i = i + 1
        print("Current line {} ---- total {}".format(i, all_lines))
        inline = inline.rstrip()
        line = inline.split(':::')[0]
        is_match = False
        seg_list = jieba_cache[line]
        llll = []
        if stop_words:
            for mmmm in seg_list:
                if mmmm not in stop_words_cache.keys():
                    llll.append(mmmm)
            seg_list = llll
        for wd in seg_list:
            if is_match:
                break
            w_bucket = p_bucket[wd]
            for bucket in w_bucket:
                array = all_bucked[bucket]
                selected = sample_dict(array, args.sample_number)
                selected = list(map(lambda x: x.split(':::')[0], selected))
                selected = list(map(lambda x: jieba_cache[x], selected))
                # remove stop words
                if stop_words:
                    filt_selected = list()
                    for sen in selected:
                        llll = []
                        for mmmm in sen:
                            if mmmm not in stop_words_cache.keys():
                                llll.append(mmmm)
                        filt_selected.append(llll)
                    selected = filt_selected
                # calculate similarity with each bucket
                if all(jaccard(seg_list, cmp_list) > args.threshold for cmp_list in selected):
                    is_match = True
                    all_bucked[bucket].append(line)
                    for w in seg_list:
                        if bucket not in p_bucket[w]:
                            p_bucket[w].append(bucket)
                    break
        # print("{} jaccard took {}".format(inline, endtime - starttime))
        if not is_match:
            bucket_name = ('tmp' + id_name).format(save_idx)
            bucket_array = [line]
            all_bucked[bucket_name] = bucket_array
            for w in seg_list:
                p_bucket[w].append(bucket_name)
            save_idx += 1

    infile.close()

    batch_size = 0
    for zzzz in all_bucked:
        batch_size = batch_size + 1
        connection = pymysql.connect(host='47.99.87.74', user='******', password='******', db='august', port=33306)
        cursor = connection.cursor()
        all_bucked_data = []
        for zx in all_bucked[zzzz]:
            all_bucked_data.append([all_bucked[zzzz][0], zx, today])
        print("Current batch {} of {}".format(batch_size, len(all_bucked)))
        # Table name 凤巢长尾词分组 = "Fengchao long-tail keyword groups"
        cursor.executemany(
            "insert into 凤巢长尾词分组(group_id,keyword,created_date) values(%s,%s,%s)",
            (all_bucked_data))
        connection.commit()
        cursor.close()
        connection.close()

    print('All is well')
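# ---------------------------------------------------------------------------
# Illustrative sketch, not from the original source: the clustering loops in
# this file compare token lists with jaccard(seg_list, cmp_list) against
# args.threshold. A minimal Jaccard similarity over token sets that matches
# that usage could look like this; the project's real helper may differ.
def jaccard(a, b):
    # Jaccard similarity of two token lists: |A ∩ B| / |A ∪ B|, in [0, 1].
    set_a, set_b = set(a), set(b)
    if not set_a and not set_b:
        return 0.0
    return len(set_a & set_b) / float(len(set_a | set_b))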
def run(self, questions):
    args = self._get_parser()

    # preliminary work
    ensure_dir(args.output)
    if args.name_len_update:
        line_cnt = line_counter(args.infile)
        args.name_len = len(str(line_cnt)) + 1
    clean_dir(args.output, args.name_len)
    # end preliminary work

    p_bucket = defaultdict(list)
    save_idx = 0
    id_name = '{0:0' + str(args.name_len) + 'd}'

    # load stop words
    stop_words = get_stop_words(args.stop_words) if os.path.exists(args.stop_words) else list()

    # load tokenizer
    seg = Segmentor(args)

    print('Splitting sentence into different clusters ...')
    infile = questions
    for inline in tqdm(infile):
        inline = inline.rstrip()
        line = inline.split(':::')[0]
        is_match = False
        seg_list = list(seg.cut(line))
        if stop_words:
            seg_list = list(filter(lambda x: x not in stop_words, seg_list))
        for wd in seg_list:
            if is_match:
                break
            w_bucket = p_bucket[wd]
            for bucket in w_bucket:
                bucket_path = os.path.join(args.output, bucket)
                check_file(bucket_path)
                selected = sample_file(bucket_path, args.sample_number)
                selected = list(map(lambda x: x.split(':::')[0], selected))
                selected = list(map(lambda x: list(seg.cut(x)), selected))
                # remove stop words
                if stop_words:
                    filt_selected = list()
                    for sen in selected:
                        sen = list(filter(lambda x: x not in stop_words, sen))
                        filt_selected.append(sen)
                    selected = filt_selected
                # calculate similarity with each bucket
                if all(jaccard(seg_list, cmp_list) > args.threshold for cmp_list in selected):
                    is_match = True
                    with open(bucket_path, 'a', encoding='utf-8') as outfile:
                        outfile.write(line + '\n')
                    for w in seg_list:
                        if bucket not in p_bucket[w]:
                            p_bucket[w].append(bucket)
                    break
        if not is_match:
            bucket_name = ('tmp' + id_name).format(save_idx)
            bucket_path = os.path.join(args.output, bucket_name)
            with open(bucket_path, 'a', encoding='utf-8') as outfile:
                outfile.write(line + '\n')
            for w in seg_list:
                p_bucket[w].append(bucket_name)
            save_idx += 1

    # sort and rename file
    file_list = os.listdir(args.output)
    file_list = list(filter(lambda x: x.startswith('tmp'), file_list))
    cnt = dict()
    for file in file_list:
        file_path = os.path.join(args.output, file)
        cnt[file] = line_counter(file_path)

    sorted_cnt = sorted(cnt.items(), key=lambda kv: kv[1], reverse=True)
    name_map = dict()
    for idx, (file_name, times) in enumerate(sorted_cnt):
        origin_path = os.path.join(args.output, file_name)
        new_name = id_name.format(idx)
        new_path = os.path.join(args.output, new_name)
        os.rename(origin_path, new_path)
        name_map[file_name] = new_name

    for k, v in p_bucket.items():
        p_bucket[k] = list(map(lambda x: name_map[x], v))

    # merge the per-cluster files into one
    output_file = os.path.join(args.output, 'all_cluster.txt')
    try:
        if os.path.isfile(output_file):
            os.unlink(output_file)
    except Exception as e:
        print(e)

    file_list = os.listdir(args.output)
    fw = open(output_file, 'w+')
    for file in file_list:
        with open(os.path.join(args.output, file)) as f:
            for line in f.readlines():
                fw.write(str(int(file)) + ',' + line)
    fw.close()

    df = pd.read_csv(output_file, names=['id', 'text'])
    df.columns = ['cluster_id', 'ques']
    print('All is well')

    # json.dumps(dict(ques=ques))
    df_dict = df.set_index('cluster_id').T.to_dict('records')[0]
    # Convert the dataframe into a dict keyed by cluster_id:
    #   df                      result
    #   cluster_id  ques
    #   0           aa    =>    aa: [aaa]
    #   0           aaa
    #   1           bb          bb: []
    # df_dict = {0: aa, 1: bb}
    print(df_dict)
    result_dict = {}
    for cluster_id, ques in df_dict.items():
        li = df[df['cluster_id'] == cluster_id].ques.values.tolist()
        # if (ques in li):
        li.remove(ques)
        result_dict[ques] = li

    my_list = [result_dict]
    my_df = pd.DataFrame(my_list).T
    my_df = my_df.reset_index()
    my_df.columns = ['ques', 'info']
    print(my_df)
    return my_df.to_json(orient="records", force_ascii=False)
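# ---------------------------------------------------------------------------
# Illustrative sketch, not from the original source: run() above (and the
# file-based variant further down) calls sample_file(bucket_path, n) to fetch
# up to n reference sentences from an existing bucket file before computing
# similarities. A minimal implementation consistent with that call could look
# like this.
import random

def sample_file(file_path, sample_number):
    # Return at most `sample_number` randomly chosen, stripped lines from the file.
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = [line.rstrip('\n') for line in f]
    if len(lines) <= sample_number:
        return lines
    return random.sample(lines, sample_number)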
def main():
    #############################################################################
    # 0.
    #
    # Check if tmp folder exists, otherwise create it
    if check_dir(settings.tmp_dir) == True:
        sys.exit(0)
    else:
        os.makedirs(settings.tmp_dir)

    # Run some checks on the source folder with core data.
    if not get_years():
        # Is there anything in the source folder to begin with?
        print "We were not able to find an XLSX file with core data in the folder: "\
              "%s. Make sure this folder contains at least one XLSX file named "\
              "after the year (eg. 2014.xlsx). Check the readme for more info "\
              "about the required structure of these files.\n"\
              "Quitting..." % (settings.src_core)
        sys.exit(0)

    # Provide feedback that the script only processes XLSX files with properly
    # formatted filenames. (eg. 2014.xlsx)
    fn_pattern = re.compile('^20[0-9]{2}$')
    for f in os.listdir(settings.src_core):
        fn = os.path.splitext(f)[0]
        ext = os.path.splitext(f)[-1].lower()
        path = os.path.join(settings.src_core, fn)
        if not os.path.isdir(path):
            # Only check files
            if ext == ".xlsx":
                if not fn_pattern.match(fn):
                    print "The XLSX file %s doesn't have a properly formatted year as "\
                          "filename and will be ignored." % (f)
            else:
                print "The script only processes XLSX files. %s will be ignored." % (f)

    print "Loading the core and meta data..."

    # Build the different sets of admin areas with things we have to loop over.
    countries = build_set('country', 'type', 'iso', settings.src_meta_aa)
    states = build_set('state', 'type', 'iso', settings.src_meta_aa)
    admin_areas = countries | states

    # Build sets for the variables we loop over
    global index_param
    index_param = build_set('param', 'type', 'id', settings.src_meta_index)
    index_score = build_set('score', 'type', 'id', settings.src_meta_index)
    sp = list(index_score | index_param)

    # Build set for the years we're interested in
    global years
    years = get_years()
    global current_yr
    current_yr = max(years)

    # Read in the files with meta-data and set the scope to global
    global df_meta_aa
    df_meta_aa = pd.read_csv(settings.src_meta_aa, index_col='iso')
    global df_meta_index
    df_meta_index = pd.read_csv(settings.src_meta_index, index_col='id')

    #############################################################################
    # 1. Store the relevant core data in one DF (df_full)
    #
    # Output: df_full
    #
    #               2014            2015
    # iso  ind      value   data    value   data
    # AR   0        1.2420  NaN     1.2235  NaN
    #      1.01     0.1802  78.17   0.1795  75.16
    # ...

    first_yr = True
    for yr in years:
        # All core data files are named after the year of the edition
        fn = settings.src_core + yr + '.xlsx'
        df_yr = pd.DataFrame()

        for sheet in settings.core_data_sheets:
            # Build an index to parse only the relevant columns
            cols_index = build_col_index(fn, sheet)
            # Read Excel (parsing only relevant cols)
            df_sheet = pd.read_excel(fn, sheet, parse_cols=cols_index)
            # Ensure that the iso codes don't contain strange characters. They can only
            # contain letters, numbers and hyphens. (eg. CN, CN-65 or IN-MP)
            df_sheet['iso'].replace(to_replace='[^a-zA-Z0-9-]', value='', inplace=True, regex=True)
            # Append each sheet to a dataframe holding the data for that year
            df_yr = df_yr.append(df_sheet)

        # Set the index of the DF to the ISO code and ID of the indicator
        df_yr.set_index(['iso', 'id'], inplace=True)
        # Make sure the index is sorted so the slicing works well
        df_yr.sortlevel(inplace=True)

        # Rename the column 'score' to value
        df_yr.rename(columns={'score': 'value'}, inplace=True)

        # Add an extra level in the hierarchy of the columns (Multi-index)
        # containing an indication of the year

        # Create list that repeats 'value' for the amount of years available
        c = [yr] * len(df_yr.columns)
        # Add a level to the cols
        df_yr.columns = [c, df_yr.columns]

        if first_yr:
            # If it's the first year, we initialize the full DataFrame
            df_full = df_yr
            first_yr = False
        else:
            # Every subsequent year will have to be merged into df_full
            df_full = pd.merge(df_full, df_yr, how='outer', left_index=True, right_index=True)
            df_full.sortlevel(axis=1, inplace=True)

    #############################################################################
    # 2. CSV downloads
    #
    # For all the CSV exports, prepare a dataframe that combines the data with
    # the meta.

    print "Building the CSV files for the download section..."

    # For the CSV, we're only interested in the value column of each year
    df_full_csv = df_full.loc[:, (slice(None), 'value')]
    df_full_csv.columns = df_full_csv.columns.get_level_values(0)

    # The full DF is a multi-index. Since the meta-files have a single index,
    # it is necessary to reset the indexes before joining on the column.
    df_full_csv = df_full_csv.reset_index()
    df_meta_aa_csv = df_meta_aa.reset_index()
    df_meta_index_csv = df_meta_index.reset_index()

    # Merge the country meta
    df_full_csv = pd.merge(df_full_csv, df_meta_aa_csv, on='iso')
    # Merge the index meta data
    df_full_csv = pd.merge(df_full_csv, df_meta_index_csv, on='id', suffixes=('_aa', '_var'))

    # Re-index the DF on iso & id and make sure it's sorted
    df_full_csv.set_index(['iso', 'id'], inplace=True)
    df_full_csv.sortlevel(inplace=True)

    # 2.0 Export the full dataset to CSV
    for lang in settings.langs:
        # Build a list with the meta-data that needs to be included
        columns = ['name:' + lang + '_aa', 'name:' + lang + '_var', 'type_var']
        columns = columns + list(years)
        file_path = (settings.exp_full_csv).format(lang=lang)
        df_full_csv.loc[slice(None), columns].to_csv(file_path, encoding='UTF-8', index=False)

    # 2.1 Generate the main CSV files
    # Slice the DF to only contain the score and parameters for the current year.
    df_main_csv = df_full_csv.loc[(slice(None), sp), :]
    for lang in settings.langs:
        # Pivot the DF and export it
        file_path = (settings.exp_current_csv).format(lang=lang, yr=current_yr)
        pivot_df(df_main_csv, 'name:' + lang + '_aa', 'name:' + lang + '_var', current_yr).to_csv(file_path, encoding='UTF-8')

    # 2.3 Generate the country + state CSV files
    for aa in admin_areas:
        # Select the data of this admin area
        df_aa_csv = df_full_csv.loc[(aa, slice(None)), :]
        for lang in settings.langs:
            # Include the name of the var, its type and the years
            columns = ['name:' + lang + '_var', 'type_var'] + list(years)
            # Select the proper columns and generate the CSV
            file_path = (settings.exp_aa_csv).format(lang=lang, aa=aa.lower())
            df_aa_csv.loc[slice(None), columns].to_csv(file_path, encoding='UTF-8', index=False)

    #############################################################################
    # 3. Calculate the rankings
    #
    # Output: df_full
    #
    #               2014                       2015
    #               value   data   gr   sr     value   data  gr   sr
    # iso  id
    # AR   0        1.2420  NaN    13   NaN    1.2235  NaN   12   NaN
    #      1.01     0.1802  73.1   5    NaN    0.1795  75.8  6    NaN
    # ...

    print "Calculating the ranking..."

    # 3.0 Prepare the structure
    # Add placeholder cols with NaN that can be updated later with df.update()
    for year in years:
        for rank in ('gr', 'sr'):
            df_full[(year, rank)] = np.nan
    # Make sure it's sorted
    df_full.sortlevel(axis=1, inplace=True)

    # 3.1 Global rank
    # The global rank (gr) is a rank of all the COUNTRIES in the project
    df_full = get_rank(countries, df_full, 'gr')

    # 3.3 State rank
    # The state rank ('sr') ranks the STATES of a particular country
    for country in countries:
        # Check if there are any states or provinces for this country
        cs = build_set(country, 'country', 'iso', settings.src_meta_aa)
        if cs:
            df_full = get_rank(cs, df_full, 'sr')

    #############################################################################
    # 4. JSON api

    print "Building the JSON files for the API..."

    # 4.1 Generate the main JSON file
    for lang in settings.langs:
        # The JSON will contain a list with dicts
        json_data = []
        # Loop over the countries list
        for country in countries:
            country_data = build_json_aa(country, df_full, lang, historic=True)
            # Sort the list of states / provinces
            if country_data['states']:
                country_data['states'] = sorted(country_data['states'], key=lambda k: k['name'])
            json_data.append(country_data)

        # Sort the list of countries by name
        sorted_data = sorted(json_data, key=lambda k: k['name'])

        # Write the list to a JSON file
        file_path = (settings.exp_core).format(lang=lang)
        write_json(file_path, sorted_data)

    # 4.3 Generate the country + state JSON files
    for aa in admin_areas:
        for lang in settings.langs:
            # Get the data for this admin area in a dict
            json_data = build_json_aa(aa, df_full, lang, indicators=True, historic=True)
            # Write the dict to a JSON file
            file_path = (settings.exp_aa).format(lang=lang, aa=aa.lower())
            write_json(file_path, json_data)

    # Fully remove the temp directory
    clean_dir(settings.tmp_dir, True)

    print "All done. The data has been prepared for use on global-climatescope.org."
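# ---------------------------------------------------------------------------
# Illustrative sketch, not from the original source: main() above calls
# build_set(value, column, field, csv_path) both to collect admin areas
# (eg. build_set('country', 'type', 'iso', ...)) and to find the states of a
# country (eg. build_set(country, 'country', 'iso', ...)). A minimal pandas
# version consistent with those call sites could look like this; the project's
# real helper may differ.
import pandas as pd

def build_set(value, column, field, csv_path):
    # Return the set of `field` values for rows where `column` equals `value`.
    df = pd.read_csv(csv_path)
    return set(df[df[column] == value][field])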
def main():
    args = _get_parser()

    # preliminary work
    check_file(args.infile)
    ensure_dir(args.output)

    if args.name_len_update:
        line_cnt = line_counter(args.infile)
        args.name_len = len(str(line_cnt)) + 1

    clean_dir(args.output, args.name_len)
    # end preliminary work

    p_bucket = defaultdict(list)
    save_idx = 0
    id_name = '{0:0' + str(args.name_len) + 'd}'

    # load stop words
    stop_words = get_stop_words(args.stop_words) if os.path.exists(args.stop_words) else list()

    # load tokenizer
    seg = Segmentor(args)

    print('Splitting sentence into different clusters ...')
    infile = open(args.infile, 'r', encoding="utf-8")
    for line in tqdm(infile):
        line = line.rstrip()
        is_match = False
        seg_list = list(seg.cut(line))
        if stop_words:
            seg_list = list(filter(lambda x: x not in stop_words, seg_list))
        for wd in seg_list:
            if is_match:
                break
            w_bucket = p_bucket[wd]
            for bucket in w_bucket:
                bucket_path = os.path.join(args.output, bucket)
                check_file(bucket_path)
                selected = sample_file(bucket_path, args.sample_number)
                selected = list(map(lambda x: list(seg.cut(x)), selected))
                # remove stop words
                if stop_words:
                    filt_selected = list()
                    for sen in selected:
                        sen = list(filter(lambda x: x not in stop_words, sen))
                        filt_selected.append(sen)
                    selected = filt_selected
                # calculate similarity with each bucket
                if all(jaccard(seg_list, cmp_list) > args.threshold for cmp_list in selected):
                    is_match = True
                    with open(bucket_path, 'a', encoding='utf-8') as outfile:
                        outfile.write(line + '\n')
                    for w in seg_list:
                        if bucket not in p_bucket[w]:
                            p_bucket[w].append(bucket)
                    break
        if not is_match:
            bucket_name = ('tmp' + id_name).format(save_idx)
            bucket_path = os.path.join(args.output, bucket_name)
            with open(bucket_path, 'a', encoding='utf-8') as outfile:
                outfile.write(line + '\n')
            for w in seg_list:
                p_bucket[w].append(bucket_name)
            save_idx += 1

    infile.close()

    # sort and rename file
    file_list = os.listdir(args.output)
    file_list = list(filter(lambda x: x.startswith('tmp'), file_list))
    cnt = dict()
    for file in file_list:
        file_path = os.path.join(args.output, file)
        cnt[file] = line_counter(file_path)

    sorted_cnt = sorted(cnt.items(), key=lambda kv: kv[1], reverse=True)
    for idx, (file_name, times) in enumerate(sorted_cnt):
        origin_path = os.path.join(args.output, file_name)
        new_path = os.path.join(args.output, id_name.format(idx))
        os.rename(origin_path, new_path)

    print('All is well')
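# ---------------------------------------------------------------------------
# Illustrative sketch, not from the original source: the clustering mains in
# this file read args.infile, args.output, args.name_len, args.name_len_update,
# args.stop_words, args.sample_number and args.threshold from _get_parser().
# A hypothetical argparse setup exposing those options could look like this;
# the defaults shown are placeholders, not the project's real values.
import argparse

def _get_parser():
    parser = argparse.ArgumentParser(description='Cluster similar sentences into buckets.')
    parser.add_argument('--infile', required=True, help='input file, one sentence per line')
    parser.add_argument('--output', required=True, help='directory that receives the bucket files')
    parser.add_argument('--name_len', type=int, default=6, help='zero-padded width of bucket file names')
    parser.add_argument('--name_len_update', action='store_true', help='derive name_len from the input line count')
    parser.add_argument('--stop_words', default='stop_words.txt', help='optional stop-word list')
    parser.add_argument('--sample_number', type=int, default=5, help='sentences sampled from a bucket per comparison')
    parser.add_argument('--threshold', type=float, default=0.5, help='Jaccard similarity threshold for joining a bucket')
    return parser.parse_args()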