from sklearn import svm


def svm_classify(train_set, train_tag, test_set, test_tag):
    """First implement a classifier with a linear kernel."""
    clf = svm.LinearSVC()
    clf_res = clf.fit(train_set, train_tag)
    train_pred = clf_res.predict(train_set)
    test_pred = clf_res.predict(test_set)
    train_err_num, train_err_ratio = FileUtil.checked_pred(train_tag, train_pred)
    test_err_num, test_err_ratio = FileUtil.checked_pred(test_tag, test_pred)
    print('=== Training finished; classification results below ===')
    print('Training-set error: {e}'.format(e=train_err_ratio))
    print('Test-set error: {e}'.format(e=test_err_ratio))
    return clf_res
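# The snippet above leans on a project-local helper, FileUtil.checked_pred,
# whose implementation is not shown. A minimal sketch of what it plausibly
# does, assuming both arguments are equal-length label sequences (the body
# is an assumption inferred from the call sites, not the project's code):
def checked_pred(data_tag, data_pred):
    err_count = sum(1 for tag, pred in zip(data_tag, data_pred) if tag != pred)
    err_ratio = err_count / len(data_tag) if len(data_tag) else 0.0
    return err_count, err_ratio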
def create_account_detail_and_account_balance_files(accounts):
    account_details = list()
    account_balances = list()
    for account in accounts:
        account_number = account['accountNumber']
        interest_rate = interest_rates.get(account['productName'])
        return_list = get_balance_and_interest_details(account, interest_rate)
        balance = return_list[0]
        account_detail = AccountDetails.generate_random_account_detail_json(
            account, return_list)
        account_balance = AccountBalance.generate_random_account_balance_json(
            account_number, balance)
        account_details.append(account_detail)
        account_balances.append(account_balance)
    FileUtil.json_to_json_file(account_details, 'generated-account-details')
    FileUtil.json_to_json_file(account_balances, 'generated-account-balances')
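# FileUtil.json_to_json_file is the shared writer used by all of these
# generator scripts. A minimal sketch, assuming it serializes its payload to
# '<name>.json' (the file-name convention and formatting are assumptions):
import json


def json_to_json_file(payload, file_name):
    with open(file_name + '.json', 'w', encoding='utf-8') as fh:
        json.dump(payload, fh, ensure_ascii=False, indent=2)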
def make_date_value_dic(self) -> dict:
    labels = self.config.get_labels()
    data_dic = {}
    for label in labels:
        data_dic[label] = {}
    file_paths = FileUtil.get_recursive_file_paths(self.config.get_base_dir())
    file_paths.sort(key=lambda x: x.split("/")[-1])
    for file_path in file_paths:
        lines = FileUtil.read_lines(file_path)
        for line in lines:
            for label in labels:
                if label not in line:
                    continue
                value = HabitReporter.extract_value_float(line)
                if value is None:
                    continue
                data_dic[label][DateUtil.extract_date(file_path)] = value
    return data_dic
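# HabitReporter.extract_value_float and DateUtil.extract_date are project
# helpers not shown in this excerpt. A plausible sketch of the value
# extractor, assuming each matching line carries at most one numeric reading
# (the regex and the None-on-miss convention are assumptions inferred from
# the call site above):
import re


def extract_value_float(line):
    match = re.search(r'-?\d+(?:\.\d+)?', line)
    return float(match.group()) if match else None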
def convert_doc_to_wordlist(str_doc, cut_all):
    """Convert document content into a flat list of tokens."""
    sent_list = str_doc.split('\n')
    # Strip unwanted characters such as \u3000 (full-width space)
    sent_list = map(FileUtil.rm_char, sent_list)
    # Segment each sentence and drop stop words
    word_2dlist = [
        FileUtil.rm_tokens(jieba.cut(part, cut_all=cut_all))
        for part in sent_list
    ]
    word_list = sum(word_2dlist, [])
    return word_list
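# FileUtil.rm_char and FileUtil.rm_tokens are project helpers referenced
# above. A minimal sketch under the assumption that rm_char strips layout
# characters and rm_tokens filters stop words (stop_words_set is a
# hypothetical stand-in for however the project loads its stop-word list):
stop_words_set = set()  # hypothetical: populated from a stop-word file


def rm_char(text):
    return text.replace('\u3000', '').strip()


def rm_tokens(words):
    # Must return a list so the sum(word_2dlist, []) flattening above works
    return [w for w in words if w not in stop_words_set and not w.isspace()]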
def get_word_2_pinyin_dict():
    path = 'testLines.txt'
    lines = FileUtil.get_file(path)
    _normal_pinyin_phrase_dict = dict()  # maps full pinyin to Phrase objects
    _first_pinyin_phrase_dict = dict()   # maps first-letter pinyin to Phrase objects
    _init_pinyin_phrase_dict = dict()    # maps initials (shengmu) to Phrase objects
    for each_line in lines:
        _full_pinyin = ''.join(PinyinUtil.get_lazy_pinyin(each_line))
        _init_pinyin = ''.join(PinyinUtil.get_init_pinyin(each_line))
        _first_pin = ''.join(PinyinUtil.get_first_pinyin(each_line))
        new_phrase = Phrase(each_line, _full_pinyin)
        append_new_val_in_dict(_normal_pinyin_phrase_dict, _full_pinyin, new_phrase)
        append_new_val_in_dict(_first_pinyin_phrase_dict, _first_pin, new_phrase)
        append_new_val_in_dict(_init_pinyin_phrase_dict, _init_pinyin, new_phrase)
    return (_normal_pinyin_phrase_dict, _first_pinyin_phrase_dict,
            _init_pinyin_phrase_dict)
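# append_new_val_in_dict is used above to build one-key-to-many-phrases maps.
# A minimal sketch consistent with the call sites (the body is an assumption):
def append_new_val_in_dict(target_dict, key, value):
    target_dict.setdefault(key, []).append(value)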
                if age > 33:
                    break
                elif bsu_count < 1:
                    bsu_count += 1
                else:
                    break
            elif account_type == 'STUDENT BRUKSKONTO':
                if age > 25 or age < 19:
                    break
            random_int = random.randint(0, 100)
            if random_int <= account_types_with_probability.get(account_type):
                new_account = Account.generate_random_account_json(
                    person, account_type)
                accounts.append(new_account)
                account_count += 1
    return accounts


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        'people',
        help='A json file containing people to generate accounts for. '
        'This file is typically output by the create_people.py script')
    args = parser.parse_args()

    with open(args.people, encoding='utf-8') as fh:
        FileUtil.json_to_json_file(create_list_of_account_json(json.load(fh)),
                                   'generated-accounts')
#!/usr/bin/env python3
import argparse

from models.person.person import Person
from utils.file_util import FileUtil


def create_list_of_people_json(number_of_people):
    people = list()
    for i in range(number_of_people):
        random_person = Person.generate_random()
        people.append(random_person.to_json())
    return people


parser = argparse.ArgumentParser(
    description="Quick hack to generate fake people and some data.")
parser.add_argument('-n', type=int, default=10,
                    help='The number of people to create')
args = parser.parse_args()

FileUtil.json_to_json_file(create_list_of_people_json(args.n),
                           'generated-people')
from create_cards_and_card_balances import create_card_balances

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Generate people along with associated account data.")
    parser.add_argument('-n', type=int, default=10,
                        help='The number of people to create')
    args = parser.parse_args()

    # Create a list of people to use
    # 'generated-people' json file created
    people = create_list_of_people_json(args.n)
    print('\nCreated people\n')
    FileUtil.json_to_json_file(people, 'generated-people')

    # Create between 1 and 6 accounts for each person
    # 'generated-accounts' json file created
    accounts = create_list_of_account_json(people)
    print('\nCreated accounts\n')
    FileUtil.json_to_json_file(accounts, 'generated-accounts')

    # Create details for each of the accounts; the helper itself writes the
    # 'generated-account-details' and 'generated-account-balances' json files
    create_account_detail_and_account_balance_files(accounts)
    print('\nCreated account details\n')

    # Add fake payments for each of the accounts
    # 'generated-booked-transactions', 'generated-due-payments' and
    # 'generated-reserved-transactions' json files created
lsi_model = None
predictor = None
if not os.path.exists(tmp_data_path):
    os.makedirs(tmp_data_path)

# n is the sampling rate: keep one document out of every n
n = 10

# ===================================================================
# Stage 1: iterate over the documents, build the dictionary, and drop
# low-frequency entries. If no dictionary exists at the given path, build
# a new one; if one already exists, skip this stage.
t0 = time.time()
if not os.path.exists(path_dictionary):
    print('=== No existing dictionary found; scanning documents to build one ===')
    dictionary = corpora.Dictionary()
    files = FileUtil.read_file_by_dir(data_path)
    for i, content in enumerate(files):
        if i % n == 0:
            catg = content[0]
            file = content[1]
            file = convert_doc_to_wordlist(file, False)
            dictionary.add_documents([file])
            if int(i / n) % 1000 == 0:
                print('{t} *** {i} \t docs have been processed'.format(
                    i=i,
                    t=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())))
    # Drop dictionary entries that occur in too few documents
    small_freq_ids = [
        tokenid for tokenid, docfreq in dictionary.dfs.items()
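# The comprehension above is cut off in this excerpt. For reference, the
# standard gensim pattern for pruning rare tokens looks like the following;
# the docfreq threshold of 5 is a placeholder, not the original value:
small_freq_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items()
                  if docfreq < 5]
dictionary.filter_tokens(bad_ids=small_freq_ids)
dictionary.compactify()  # reassign ids to close the gaps left by filtering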
#!/usr/bin/env python
# coding=UTF-8
'''
# Desc:
# Author: TavisD
# Time: 2016-10-10 10:53
# Ver: V1.0
'''
from utils.api_client import APIClient
from utils.file_util import FileUtil
from utils.mysql_util import MysqlUtil
from utils.gen_util import GenUtil

api_client = APIClient()
file_util = FileUtil()
mysql_util = MysqlUtil()
gen_util = GenUtil()
    account_count = 0
    bsu_count = 0
    while account_count <= number_of_accounts:
        for account_type in account_types_with_probability:
            # a person cannot have more than 1 BSU account
            if account_type == 'BSU':
                if age > 33:
                    break
                elif bsu_count < 1:
                    bsu_count += 1
                else:
                    break
            elif account_type == 'STUDENT BRUKSKONTO':
                if age > 25 or age < 19:
                    break
            random_int = random.randint(0, 100)
            if random_int <= account_types_with_probability.get(account_type):
                new_account = Account.generate_random_account_json(
                    person, account_type)
                accounts.append(new_account)
                account_count += 1
    return accounts


parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    'people',
    help='A json file containing people to generate accounts for. '
    'This file is typically output by the create_people.py script')
args = parser.parse_args()

# Use a context manager so the input file handle is closed properly
with open(args.people, encoding='utf-8') as fh:
    FileUtil.json_to_json_file(create_list_of_account_json(json.load(fh)),
                               'generated-accounts')
def create_card_balances(cards):
    balances = list()
    for card in cards:
        # Only generate a balance for credit cards
        if card.card_type == CardType.CREDIT:
            balances.append(CardBalance.generate_random(card))
    return balances


if __name__ == "__main__":
    # Handle CLI arguments
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        'accounts',
        help='A json file containing accounts to generate cards for. '
        'This file is typically output by the create_accounts.py script')
    args = parser.parse_args()

    # Business time
    with open(args.accounts, encoding='utf-8') as fh:
        accounts_file = json.load(fh)

    cards = create_cards(accounts_file)
    balances = create_card_balances(cards)
    cards_json = list(map(lambda c: c.to_json(), cards))
    balances_json = list(map(lambda s: s.to_json(), balances))
    FileUtil.json_to_json_file(cards_json, 'generated-cards')
    FileUtil.json_to_json_file(balances_json, 'generated-card-balances')
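# CardType is imported from the project's models and is not shown in this
# excerpt. A minimal sketch of the enum as the comparison above assumes it
# (any member other than CREDIT is a guess for illustration):
from enum import Enum


class CardType(Enum):
    CREDIT = 'CREDIT'
    DEBIT = 'DEBIT'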
# this is the Main file
from utils.file_util import FileUtil
from utils.wordhelper import WordHelper
from utils.pinyin_util import *
from entity import *

if __name__ == '__main__':
    # TODO: load the corpus file first and build the index
    path = 'testLines.txt'
    lines = FileUtil.get_file(path)
    words_line_dict = dict()          # maps words to Phrase objects
    normal_pinyin_ciyu_dict = dict()  # maps full pinyin to word (Ciyu) objects
    first_pinyin_ciyu_dict = dict()   # maps first-letter pinyin to word objects
    init_pinyin_ciyu_dict = dict()    # maps initials (shengmu) to word objects
    words = []                        # segmentation result of one line
    for each_line in lines:
        _full_pinyin = PinyinUtil.get_lazy_pinyin(each_line)
        new_phrase = Phrase(each_line, _full_pinyin)
        words = WordHelper.seg(each_line)  # TODO: word segmentation
        for each_word in words:
            first_pinyin = PinyinUtil.get_first_pinyin(each_word)  # first-letter pinyin
            normal_pinyin = PinyinUtil.get_lazy_pinyin(each_word)  # full pinyin
            init_pinyin = PinyinUtil.get_init_pinyin(each_word)    # initials
            new_ciyu = Ciyu(each_word, normal_pinyin, first_pinyin)
            # add the word to the first-letter-pinyin index
            if first_pinyin not in first_pinyin_ciyu_dict:
                first_pinyin_ciyu_dict[first_pinyin] = list()
            first_pinyin_ciyu_dict[first_pinyin].append(each_word)
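# The "check key, create list, append" pattern above is exactly what
# collections.defaultdict automates; an equivalent, more idiomatic form of
# the same index (a sketch using the names from the loop above):
from collections import defaultdict

first_pinyin_ciyu_dict = defaultdict(list)
# inside the loop:
#     first_pinyin_ciyu_dict[first_pinyin].append(each_word)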