def main():
    #############################################################################
    # 0.
    #
    # Check if tmp folder exists, otherwise create it
    check_create_folder(settings.tmp_dir)

    # Build the list with countries and states
    admin_areas = get_aa_list()

    for chart in settings.charts:
        ind_source = settings.src_auxiliary + str(settings.current_edition) + '-' + str(chart["id"]) + '.csv'

        global_avg = False
        # Calculate the global average for this chart
        if "global_average" in chart and chart["global_average"]:
            global_avg = get_avg(chart, ind_source)

        for aa in admin_areas:
            iso = aa.lower()
            for lang in settings.langs:
                # Initialize the dict that will be written to JSON
                json_data = {
                    "name": iso,
                    "iso": iso,
                    "meta": {
                        "title": chart["title"][lang],
                        "label-x": chart["labelx"][lang],
                        "label-y": chart["labely"][lang]
                    },
                    "data": []
                }

                for serie in chart["series"]:
                    if serie["id"] == 'country':
                        # If we're dealing with a country, use the country name as the label of the serie
                        serie_name = aa
                    else:
                        serie_name = serie["name"][lang]

                    # Initialize the object for the serie
                    serie_to_append = {"name": serie_name, "id": serie["id"], "values": []}

                    # Add a note to the serie
                    if chart["note"]:
                        serie_to_append["note"] = add_note(serie, ind_source, aa)

                    # Generate the actual data
                    serie_to_append["values"] = chart['function'](serie, ind_source, lang, aa, chart["years"], global_avg)

                    json_data["data"].append(serie_to_append)

                # Write the dict to a JSON file
                file_path = (settings.exp_aux_json).format(lang=lang, indicator=chart["export"], aa=iso)
                write_json(file_path, json_data)

    # Fully remove the temp directory
    clean_dir(settings.tmp_dir, True)

    print "All done. The auxiliary data has been prepared for use on global-climatescope.org."
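# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original script: main() above relies on
# a write_json(file_path, data) helper that is defined elsewhere in the
# project. A minimal implementation consistent with how it is called could
# look like the following; the real helper may differ (encoding options,
# indentation, directory creation).
import json

def write_json(file_path, data):
    # Serialize `data` and write it to `file_path`, overwriting any existing file.
    with open(file_path, 'w') as f:
        json.dump(data, f)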
def main():
    #############################################################################
    # 0.
    #
    # Check if tmp folder exists, otherwise create it
    check_create_folder(settings.tmp_dir)

    # Build the list with countries and states
    admin_areas = get_aa_list()

    for chart in settings.charts:
        ind_source = settings.src_auxiliary + str(settings.current_edition) + '-' + str(chart["id"]) + '.csv'

        global_avg = False
        # Calculate the global average for this chart
        if "global_average" in chart and chart["global_average"]:
            global_avg = get_avg(chart, ind_source)

        for aa in admin_areas:
            iso = aa.lower()
            for lang in settings.langs:
                # Initialize the dict that will be written to JSON
                json_data = {
                    "name": iso,
                    "iso": iso,
                    "meta": {
                        "title": chart["title"][lang],
                        "label-x": chart["labelx"][lang],
                        "label-y": chart["labely"][lang]
                    },
                    "data": []
                }

                for serie in chart["series"]:
                    if serie["id"] == 'country':
                        # If we're dealing with a country, use the country name as the label of the serie
                        serie_name = aa
                    else:
                        serie_name = serie["name"][lang]

                    # Initialize the object for the serie
                    serie_to_append = {"name": serie_name, "id": serie["id"], "values": []}

                    # Generate the actual data
                    serie_to_append["values"] = chart['function'](serie, ind_source, lang, aa, chart["years"], global_avg)

                    json_data["data"].append(serie_to_append)

                # Write the dict to a JSON file
                file_path = (settings.exp_aux_json).format(lang=lang, indicator=chart["export"], aa=iso)
                write_json(file_path, json_data)

    # Fully remove the temp directory
    clean_dir(settings.tmp_dir, True)

    print "All done. The auxiliary data has been prepared for use on global-climatescope.org."
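# ---------------------------------------------------------------------------
# Illustrative sketch, not from the original settings module: the two main()
# variants above read a number of keys from each entry in settings.charts. A
# hypothetical entry with exactly those keys might look like this; the id,
# labels, languages and the generator function are made-up placeholders.
example_chart = {
    "id": 101,                           # used to build the source CSV filename
    "export": "installed-capacity",      # used in the exported JSON filename
    "title": {"en": "Installed capacity", "es": "Capacidad instalada"},
    "labelx": {"en": "Year", "es": "Ano"},
    "labely": {"en": "MW", "es": "MW"},
    "years": [2013, 2014, 2015],
    "note": False,                       # when truthy, add_note() is called per serie
    "global_average": True,              # when truthy, get_avg() is called once per chart
    "series": [
        {"id": "country", "name": {"en": "Country", "es": "Pais"}},
        {"id": "global", "name": {"en": "Global average", "es": "Promedio global"}},
    ],
    # Placeholder: the real project plugs in a data-generation function with
    # the signature (serie, ind_source, lang, aa, years, global_avg).
    "function": lambda serie, ind_source, lang, aa, years, global_avg: [],
}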
def run(keyword, title_matching=False):
    per_search = 100
    init_results = search(keyword, per_search, offset=0)
    total = init_results['total']
    total_search = total // per_search
    insert_search_log(keyword, total)

    output_dir = f'{dw_path}/{keyword}'
    make_dir(output_dir)
    keyword_id = get_keyword_id(keyword)
    print(f'{total} models found')

    for i in range(total_search + 1):
        results = search(keyword, per_search, offset=i * per_search)
        for item in tqdm(results['entries']):
            try:
                id = item['id']
                name = filter_escape_char(item['title'])
                if is_model(id):
                    continue
                if title_matching and keyword not in item['title'].lower():
                    continue
                zip_file = download(output_dir, item)
                if not zip_file:
                    continue
                unzipped_dir = unzip_file(zip_file)
                files = filter_files(unzipped_dir)
                for file in files:
                    moved_file = move_file(join(unzipped_dir, file), output_dir)
                    obj_file = convert_to_obj(moved_file)
                    # if 'bot_smontage' in item['binaryNames']:
                    #     image = item['binaries']['bot_smontage']['contentUrl']
                    # else:
                    image = item['binaries']['bot_lt']['contentUrl']
                    insert_dw_file(id, name, image, obj_file, keyword_id)
                shutil.rmtree(unzipped_dir)
            except Exception as e:
                logging.error(f'[{keyword}]:{e}')

    clean_dir(output_dir)
    create_image(output_dir)
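# ---------------------------------------------------------------------------
# Illustrative usage, not part of the original module: run() only needs a
# search keyword; title_matching restricts downloads to results whose title
# contains that keyword. The keyword below is a made-up example and assumes
# dw_path, search(), download() and the other helpers are already configured.
if __name__ == '__main__':
    run('office chair', title_matching=True)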
def main():
    global connection, cursor

    cpu = multiprocessing.cpu_count()
    print("CPU {}".format(cpu))

    # preliminary work
    check_file(args.infile)
    ensure_dir(args.output)

    all_lines = 0
    if args.name_len_update:
        line_cnt = line_counter(args.infile)
        args.name_len = len(str(line_cnt)) + 1
    clean_dir(args.output, args.name_len)
    # end preliminary work

    all_bucked = defaultdict(list)
    p_bucket = defaultdict(list)
    save_idx = 0
    id_name = '{0:0' + str(args.name_len) + 'd}'

    # load tokenizer
    print('Splitting sentence into different clusters ...')
    infile = open(args.infile, 'r', encoding="utf-8")
    i = 0
    all_data = infile.readlines()
    n = 10000  # split the big list into smaller chunks of n lines each
    lstgs = [all_data[i:i + n] for i in range(0, len(all_data), n)]
    print(len(lstgs))

    r = []
    tr = []
    pool = multiprocessing.Pool(processes=4)
    for xyz in lstgs:
        tr.append(pool.apply_async(fenci, (xyz, )))  # fenci (分词) = tokenize; worker defined elsewhere
    pool.close()
    pool.join()

    for res in tr:
        tmp = res.get()
        for z in tmp:
            if z not in jieba_cache.keys():
                jieba_cache[z] = tmp[z]
            else:
                print(z)

    for st in stop_words:
        stop_words_cache[st] = 1

    r.clear()
    r = None

    all_lines = len(jieba_cache)
    print("Starting: {} lines in total".format(all_lines))
    print("jieba cache ready: {}".format(len(jieba_cache)))
    print("stop-word cache ready: {}".format(len(stop_words_cache)))

    all_data = jieba_cache.keys()
    for inline in all_data:
        if inline == '太原去贵阳怎么走':  # leftover debugging hook for a specific sample sentence
            print("")
        i = i + 1
        print("Current line {} ---- total {}".format(i, all_lines))
        inline = inline.rstrip()
        line = inline.split(':::')[0]
        is_match = False
        seg_list = jieba_cache[line]
        llll = []
        if stop_words:
            for mmmm in seg_list:
                if mmmm not in stop_words_cache.keys():
                    llll.append(mmmm)
            seg_list = llll
        for wd in seg_list:
            if is_match:
                break
            w_bucket = p_bucket[wd]
            for bucket in w_bucket:
                array = all_bucked[bucket]
                selected = sample_dict(array, args.sample_number)
                selected = list(map(lambda x: x.split(':::')[0], selected))
                selected = list(map(lambda x: jieba_cache[x], selected))
                # remove stop words
                if stop_words:
                    filt_selected = list()
                    for sen in selected:
                        llll = []
                        for mmmm in sen:
                            if mmmm not in stop_words_cache.keys():
                                llll.append(mmmm)
                        filt_selected.append(llll)
                    selected = filt_selected
                # calculate similarity with each bucket
                if all(jaccard(seg_list, cmp_list) > args.threshold for cmp_list in selected):
                    is_match = True
                    all_bucked[bucket].append(line)
                    for w in seg_list:
                        if bucket not in p_bucket[w]:
                            p_bucket[w].append(bucket)
                    break
        # print("{} jaccard took {}".format(inline, endtime - starttime))
        if not is_match:
            bucket_name = ('tmp' + id_name).format(save_idx)
            bucket_array = [line]
            all_bucked[bucket_name] = bucket_array
            for w in seg_list:
                p_bucket[w].append(bucket_name)
            save_idx += 1

    infile.close()

    batch_size = 0
    for zzzz in all_bucked:
        batch_size = batch_size + 1
        connection = pymysql.connect(host='47.99.87.74', user='******', password='******', db='august', port=33306)
        cursor = connection.cursor()
        all_bucked_data = []
        for zx in all_bucked[zzzz]:
            all_bucked_data.append([all_bucked[zzzz][0], zx, today])
        print("Current batch {} of {}".format(batch_size, len(all_bucked)))
        # Table name 凤巢长尾词分组 = "Fengchao long-tail keyword groups"
        cursor.executemany(
            "insert into 凤巢长尾词分组(group_id,keyword,created_date) values(%s,%s,%s)",
            (all_bucked_data))
        connection.commit()
        cursor.close()
        connection.close()

    print('All is well')
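# ---------------------------------------------------------------------------
# Illustrative sketch, not from the original source: the clustering loops in
# this file compare token lists with jaccard(seg_list, cmp_list) against
# args.threshold. A minimal Jaccard similarity over token sets that matches
# that usage could look like this; the project's real helper may differ.
def jaccard(a, b):
    # Jaccard similarity of two token lists: |A ∩ B| / |A ∪ B|, in [0, 1].
    set_a, set_b = set(a), set(b)
    if not set_a and not set_b:
        return 0.0
    return len(set_a & set_b) / float(len(set_a | set_b))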
def run(self, questions):
    args = self._get_parser()

    # preliminary work
    ensure_dir(args.output)
    if args.name_len_update:
        line_cnt = line_counter(args.infile)
        args.name_len = len(str(line_cnt)) + 1
    clean_dir(args.output, args.name_len)
    # end preliminary work

    p_bucket = defaultdict(list)
    save_idx = 0
    id_name = '{0:0' + str(args.name_len) + 'd}'

    # load stop words
    stop_words = get_stop_words(args.stop_words) if os.path.exists(args.stop_words) else list()

    # load tokenizer
    seg = Segmentor(args)

    print('Splitting sentence into different clusters ...')
    infile = questions
    for inline in tqdm(infile):
        inline = inline.rstrip()
        line = inline.split(':::')[0]
        is_match = False
        seg_list = list(seg.cut(line))
        if stop_words:
            seg_list = list(filter(lambda x: x not in stop_words, seg_list))
        for wd in seg_list:
            if is_match:
                break
            w_bucket = p_bucket[wd]
            for bucket in w_bucket:
                bucket_path = os.path.join(args.output, bucket)
                check_file(bucket_path)
                selected = sample_file(bucket_path, args.sample_number)
                selected = list(map(lambda x: x.split(':::')[0], selected))
                selected = list(map(lambda x: list(seg.cut(x)), selected))
                # remove stop words
                if stop_words:
                    filt_selected = list()
                    for sen in selected:
                        sen = list(filter(lambda x: x not in stop_words, sen))
                        filt_selected.append(sen)
                    selected = filt_selected
                # calculate similarity with each bucket
                if all(jaccard(seg_list, cmp_list) > args.threshold for cmp_list in selected):
                    is_match = True
                    with open(bucket_path, 'a', encoding='utf-8') as outfile:
                        outfile.write(line + '\n')
                    for w in seg_list:
                        if bucket not in p_bucket[w]:
                            p_bucket[w].append(bucket)
                    break
        if not is_match:
            bucket_name = ('tmp' + id_name).format(save_idx)
            bucket_path = os.path.join(args.output, bucket_name)
            with open(bucket_path, 'a', encoding='utf-8') as outfile:
                outfile.write(line + '\n')
            for w in seg_list:
                p_bucket[w].append(bucket_name)
            save_idx += 1

    # sort and rename file
    file_list = os.listdir(args.output)
    file_list = list(filter(lambda x: x.startswith('tmp'), file_list))
    cnt = dict()
    for file in file_list:
        file_path = os.path.join(args.output, file)
        cnt[file] = line_counter(file_path)

    sorted_cnt = sorted(cnt.items(), key=lambda kv: kv[1], reverse=True)
    name_map = dict()
    for idx, (file_name, times) in enumerate(sorted_cnt):
        origin_path = os.path.join(args.output, file_name)
        new_name = id_name.format(idx)
        new_path = os.path.join(args.output, new_name)
        os.rename(origin_path, new_path)
        name_map[file_name] = new_name

    for k, v in p_bucket.items():
        p_bucket[k] = list(map(lambda x: name_map[x], v))

    # merge the per-cluster files into one
    output_file = os.path.join(args.output, 'all_cluster.txt')
    try:
        if os.path.isfile(output_file):
            os.unlink(output_file)
    except Exception as e:
        print(e)

    file_list = os.listdir(args.output)
    fw = open(output_file, 'w+')
    for file in file_list:
        with open(os.path.join(args.output, file)) as f:
            for line in f.readlines():
                fw.write(str(int(file)) + ',' + line)
    fw.close()

    df = pd.read_csv(output_file, names=['id', 'text'])
    df.columns = ['cluster_id', 'ques']
    print('All is well')

    # json.dumps(dict(ques=ques))
    df_dict = df.set_index('cluster_id').T.to_dict('records')[0]
    # Convert the dataframe into a dict keyed by cluster_id:
    #   df                      result
    #   cluster_id  ques
    #   0           aa    =>    aa: [aaa]
    #   0           aaa
    #   1           bb          bb: []
    # df_dict = {0: aa, 1: bb}
    print(df_dict)
    result_dict = {}
    for cluster_id, ques in df_dict.items():
        li = df[df['cluster_id'] == cluster_id].ques.values.tolist()
        # if (ques in li):
        li.remove(ques)
        result_dict[ques] = li

    my_list = [result_dict]
    my_df = pd.DataFrame(my_list).T
    my_df = my_df.reset_index()
    my_df.columns = ['ques', 'info']
    print(my_df)
    return my_df.to_json(orient="records", force_ascii=False)
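# ---------------------------------------------------------------------------
# Illustrative sketch, not from the original source: run() above (and the
# file-based variant further down) calls sample_file(bucket_path, n) to fetch
# up to n reference sentences from an existing bucket file before computing
# similarities. A minimal implementation consistent with that call could look
# like this.
import random

def sample_file(file_path, sample_number):
    # Return at most `sample_number` randomly chosen, stripped lines from the file.
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = [line.rstrip('\n') for line in f]
    if len(lines) <= sample_number:
        return lines
    return random.sample(lines, sample_number)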
def main():
    #############################################################################
    # 0.
    #
    # Check if tmp folder exists, otherwise create it
    if check_dir(settings.tmp_dir) == True:
        sys.exit(0)
    else:
        os.makedirs(settings.tmp_dir)

    # Run some checks on the source folder with core data.
    if not get_years():
        # Is there anything in the source folder to begin with?
        print "We were not able to find an XLSX file with core data in the folder: "\
              "%s. Make sure this folder contains at least one XLSX file named "\
              "after the year (eg. 2014.xlsx). Check the readme for more info "\
              "about the required structure of these files.\n"\
              "Quitting..." % (settings.src_core)
        sys.exit(0)

    # Provide feedback that the script only processes XLSX files with properly
    # formatted filenames. (eg. 2014.xlsx)
    fn_pattern = re.compile('^20[0-9]{2}$')
    for f in os.listdir(settings.src_core):
        fn = os.path.splitext(f)[0]
        ext = os.path.splitext(f)[-1].lower()
        path = os.path.join(settings.src_core, fn)
        if not os.path.isdir(path):
            # Only check files
            if ext == ".xlsx":
                if not fn_pattern.match(fn):
                    print "The XLSX file %s doesn't have a properly formatted year as "\
                          "filename and will be ignored." % (f)
            else:
                print "The script only processes XLSX files. %s will be ignored." % (f)

    print "Loading the core and meta data..."

    # Build the different sets of admin areas with things we have to loop over.
    countries = build_set('country', 'type', 'iso', settings.src_meta_aa)
    states = build_set('state', 'type', 'iso', settings.src_meta_aa)
    admin_areas = countries | states

    # Build sets for the variables we loop over
    global index_param
    index_param = build_set('param', 'type', 'id', settings.src_meta_index)
    index_score = build_set('score', 'type', 'id', settings.src_meta_index)
    sp = list(index_score | index_param)

    # Build set for the years we're interested in
    global years
    years = get_years()
    global current_yr
    current_yr = max(years)

    # Read in the files with meta-data and set the scope to global
    global df_meta_aa
    df_meta_aa = pd.read_csv(settings.src_meta_aa, index_col='iso')
    global df_meta_index
    df_meta_index = pd.read_csv(settings.src_meta_index, index_col='id')

    #############################################################################
    # 1. Store the relevant core data in one DF (df_full)
    #
    # Output: df_full
    #
    #               2014            2015
    # iso  ind      value   data    value   data
    # AR   0        1.2420  NaN     1.2235  NaN
    #      1.01     0.1802  78.17   0.1795  75.16
    # ...

    first_yr = True
    for yr in years:
        # All core data files are named after the year of the edition
        fn = settings.src_core + yr + '.xlsx'
        df_yr = pd.DataFrame()

        for sheet in settings.core_data_sheets:
            # Build an index to parse only the relevant columns
            cols_index = build_col_index(fn, sheet)
            # Read Excel (parsing only relevant cols)
            df_sheet = pd.read_excel(fn, sheet, parse_cols=cols_index)
            # Ensure that the iso codes don't contain strange characters. They can only
            # contain letters, numbers and hyphens. (eg. CN, CN-65 or IN-MP)
            df_sheet['iso'].replace(to_replace='[^a-zA-Z0-9-]', value='', inplace=True, regex=True)
            # Append each sheet to a dataframe holding the data for that year
            df_yr = df_yr.append(df_sheet)

        # Set the index of the DF to the ISO code and ID of the indicator
        df_yr.set_index(['iso', 'id'], inplace=True)
        # Make sure the index is sorted so the slicing works well
        df_yr.sortlevel(inplace=True)

        # Rename the column 'score' to value
        df_yr.rename(columns={'score': 'value'}, inplace=True)

        # Add an extra level in the hierarchy of the columns (Multi-index)
        # containing an indication of the year

        # Create list that repeats 'value' for the amount of years available
        c = [yr] * len(df_yr.columns)
        # Add a level to the cols
        df_yr.columns = [c, df_yr.columns]

        if first_yr:
            # If it's the first year, we initialize the full DataFrame
            df_full = df_yr
            first_yr = False
        else:
            # Every subsequent year will have to be merged into df_full
            df_full = pd.merge(df_full, df_yr, how='outer', left_index=True, right_index=True)
            df_full.sortlevel(axis=1, inplace=True)

    #############################################################################
    # 2. CSV downloads
    #
    # For all the CSV exports, prepare a dataframe that combines the data with
    # the meta.

    print "Building the CSV files for the download section..."

    # For the CSV, we're only interested in the value column of each year
    df_full_csv = df_full.loc[:, (slice(None), 'value')]
    df_full_csv.columns = df_full_csv.columns.get_level_values(0)

    # The full DF is a multi-index. Since the meta-files have a single index,
    # it is necessary to reset the indexes before joining on the column.
    df_full_csv = df_full_csv.reset_index()
    df_meta_aa_csv = df_meta_aa.reset_index()
    df_meta_index_csv = df_meta_index.reset_index()

    # Merge the country meta
    df_full_csv = pd.merge(df_full_csv, df_meta_aa_csv, on='iso')
    # Merge the index meta data
    df_full_csv = pd.merge(df_full_csv, df_meta_index_csv, on='id', suffixes=('_aa', '_var'))

    # Re-index the DF on iso & id and make sure it's sorted
    df_full_csv.set_index(['iso', 'id'], inplace=True)
    df_full_csv.sortlevel(inplace=True)

    # 2.0 Export the full dataset to CSV
    for lang in settings.langs:
        # Build a list with the meta-data that needs to be included
        columns = ['name:' + lang + '_aa', 'name:' + lang + '_var', 'type_var']
        columns = columns + list(years)
        file_path = (settings.exp_full_csv).format(lang=lang)
        df_full_csv.loc[slice(None), columns].to_csv(file_path, encoding='UTF-8', index=False)

    # 2.1 Generate the main CSV files
    # Slice the DF to only contain the score and parameters for the current year.
    df_main_csv = df_full_csv.loc[(slice(None), sp), :]
    for lang in settings.langs:
        # Pivot the DF and export it
        file_path = (settings.exp_current_csv).format(lang=lang, yr=current_yr)
        pivot_df(df_main_csv, 'name:' + lang + '_aa', 'name:' + lang + '_var', current_yr).to_csv(file_path, encoding='UTF-8')

    # 2.3 Generate the country + state CSV files
    for aa in admin_areas:
        # Select the data of this admin area
        df_aa_csv = df_full_csv.loc[(aa, slice(None)), :]
        for lang in settings.langs:
            # Include the name of the var, its type and the years
            columns = ['name:' + lang + '_var', 'type_var'] + list(years)
            # Select the proper columns and generate the CSV
            file_path = (settings.exp_aa_csv).format(lang=lang, aa=aa.lower())
            df_aa_csv.loc[slice(None), columns].to_csv(file_path, encoding='UTF-8', index=False)

    #############################################################################
    # 3. Calculate the rankings
    #
    # Output: df_full
    #
    #               2014                       2015
    #               value   data   gr   sr     value   data  gr   sr
    # iso  id
    # AR   0        1.2420  NaN    13   NaN    1.2235  NaN   12   NaN
    #      1.01     0.1802  73.1   5    NaN    0.1795  75.8  6    NaN
    # ...

    print "Calculating the ranking..."

    # 3.0 Prepare the structure
    # Add placeholder cols with NaN that can be updated later with df.update()
    for year in years:
        for rank in ('gr', 'sr'):
            df_full[(year, rank)] = np.nan
    # Make sure it's sorted
    df_full.sortlevel(axis=1, inplace=True)

    # 3.1 Global rank
    # The global rank (gr) is a rank of all the COUNTRIES in the project
    df_full = get_rank(countries, df_full, 'gr')

    # 3.3 State rank
    # The state rank ('sr') ranks the STATES of a particular country
    for country in countries:
        # Check if there are any states or provinces for this country
        cs = build_set(country, 'country', 'iso', settings.src_meta_aa)
        if cs:
            df_full = get_rank(cs, df_full, 'sr')

    #############################################################################
    # 4. JSON api

    print "Building the JSON files for the API..."

    # 4.1 Generate the main JSON file
    for lang in settings.langs:
        # The JSON will contain a list with dicts
        json_data = []
        # Loop over the countries list
        for country in countries:
            country_data = build_json_aa(country, df_full, lang, historic=True)
            # Sort the list of states / provinces
            if country_data['states']:
                country_data['states'] = sorted(country_data['states'], key=lambda k: k['name'])
            json_data.append(country_data)

        # Sort the list of countries by name
        sorted_data = sorted(json_data, key=lambda k: k['name'])

        # Write the list to a JSON file
        file_path = (settings.exp_core).format(lang=lang)
        write_json(file_path, sorted_data)

    # 4.3 Generate the country + state JSON files
    for aa in admin_areas:
        for lang in settings.langs:
            # Get the data for this admin area in a dict
            json_data = build_json_aa(aa, df_full, lang, indicators=True, historic=True)
            # Write the dict to a JSON file
            file_path = (settings.exp_aa).format(lang=lang, aa=aa.lower())
            write_json(file_path, json_data)

    # Fully remove the temp directory
    clean_dir(settings.tmp_dir, True)

    print "All done. The data has been prepared for use on global-climatescope.org."
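# ---------------------------------------------------------------------------
# Illustrative sketch, not from the original source: main() above calls
# build_set(value, column, field, csv_path) both to collect admin areas
# (eg. build_set('country', 'type', 'iso', ...)) and to find the states of a
# country (eg. build_set(country, 'country', 'iso', ...)). A minimal pandas
# version consistent with those call sites could look like this; the project's
# real helper may differ.
import pandas as pd

def build_set(value, column, field, csv_path):
    # Return the set of `field` values for rows where `column` equals `value`.
    df = pd.read_csv(csv_path)
    return set(df[df[column] == value][field])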
def main():
    args = _get_parser()

    # preliminary work
    check_file(args.infile)
    ensure_dir(args.output)

    if args.name_len_update:
        line_cnt = line_counter(args.infile)
        args.name_len = len(str(line_cnt)) + 1

    clean_dir(args.output, args.name_len)
    # end preliminary work

    p_bucket = defaultdict(list)
    save_idx = 0
    id_name = '{0:0' + str(args.name_len) + 'd}'

    # load stop words
    stop_words = get_stop_words(args.stop_words) if os.path.exists(args.stop_words) else list()

    # load tokenizer
    seg = Segmentor(args)

    print('Splitting sentence into different clusters ...')
    infile = open(args.infile, 'r', encoding="utf-8")
    for line in tqdm(infile):
        line = line.rstrip()
        is_match = False
        seg_list = list(seg.cut(line))
        if stop_words:
            seg_list = list(filter(lambda x: x not in stop_words, seg_list))
        for wd in seg_list:
            if is_match:
                break
            w_bucket = p_bucket[wd]
            for bucket in w_bucket:
                bucket_path = os.path.join(args.output, bucket)
                check_file(bucket_path)
                selected = sample_file(bucket_path, args.sample_number)
                selected = list(map(lambda x: list(seg.cut(x)), selected))
                # remove stop words
                if stop_words:
                    filt_selected = list()
                    for sen in selected:
                        sen = list(filter(lambda x: x not in stop_words, sen))
                        filt_selected.append(sen)
                    selected = filt_selected
                # calculate similarity with each bucket
                if all(jaccard(seg_list, cmp_list) > args.threshold for cmp_list in selected):
                    is_match = True
                    with open(bucket_path, 'a', encoding='utf-8') as outfile:
                        outfile.write(line + '\n')
                    for w in seg_list:
                        if bucket not in p_bucket[w]:
                            p_bucket[w].append(bucket)
                    break
        if not is_match:
            bucket_name = ('tmp' + id_name).format(save_idx)
            bucket_path = os.path.join(args.output, bucket_name)
            with open(bucket_path, 'a', encoding='utf-8') as outfile:
                outfile.write(line + '\n')
            for w in seg_list:
                p_bucket[w].append(bucket_name)
            save_idx += 1

    infile.close()

    # sort and rename file
    file_list = os.listdir(args.output)
    file_list = list(filter(lambda x: x.startswith('tmp'), file_list))
    cnt = dict()
    for file in file_list:
        file_path = os.path.join(args.output, file)
        cnt[file] = line_counter(file_path)

    sorted_cnt = sorted(cnt.items(), key=lambda kv: kv[1], reverse=True)
    for idx, (file_name, times) in enumerate(sorted_cnt):
        origin_path = os.path.join(args.output, file_name)
        new_path = os.path.join(args.output, id_name.format(idx))
        os.rename(origin_path, new_path)

    print('All is well')
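# ---------------------------------------------------------------------------
# Illustrative sketch, not from the original source: the clustering mains in
# this file read args.infile, args.output, args.name_len, args.name_len_update,
# args.stop_words, args.sample_number and args.threshold from _get_parser().
# A hypothetical argparse setup exposing those options could look like this;
# the defaults shown are placeholders, not the project's real values.
import argparse

def _get_parser():
    parser = argparse.ArgumentParser(description='Cluster similar sentences into buckets.')
    parser.add_argument('--infile', required=True, help='input file, one sentence per line')
    parser.add_argument('--output', required=True, help='directory that receives the bucket files')
    parser.add_argument('--name_len', type=int, default=6, help='zero-padded width of bucket file names')
    parser.add_argument('--name_len_update', action='store_true', help='derive name_len from the input line count')
    parser.add_argument('--stop_words', default='stop_words.txt', help='optional stop-word list')
    parser.add_argument('--sample_number', type=int, default=5, help='sentences sampled from a bucket per comparison')
    parser.add_argument('--threshold', type=float, default=0.5, help='Jaccard similarity threshold for joining a bucket')
    return parser.parse_args()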