Пример #1
0
def svm_classify(train_set, train_tag, test_set, test_tag):
    """Train a linear-kernel SVM and report in-sample vs. held-out error.

    Fits ``svm.LinearSVC`` on the training data, scores the fitted model
    on both splits via ``FileUtil.checked_pred``, prints the two error
    ratios, and returns the fitted classifier.
    """
    classifier = svm.LinearSVC().fit(train_set, train_tag)

    # Predict on both splits so training and test error can be compared.
    pred_train = classifier.predict(train_set)
    pred_test = classifier.predict(test_set)

    _, ratio_train = FileUtil.checked_pred(train_tag, pred_train)
    _, ratio_test = FileUtil.checked_pred(test_tag, pred_test)

    print('=== 分类训练完毕,分类结果如下 ===')
    print('训练集误差: {e}'.format(e=ratio_train))
    print('检验集误差: {e}'.format(e=ratio_test))
    return classifier
Пример #2
0
def create_account_detail_and_account_balance_files(accounts):
    """Build detail and balance records for *accounts* and dump them to JSON files.

    For each account, looks up its interest rate by product name, derives
    balance/interest info, and collects one detail record and one balance
    record. Writes 'generated-account-details' and
    'generated-account-balances'; returns None.
    """
    details = []
    balances = []

    for acct in accounts:
        rate = interest_rates.get(acct['productName'])
        info = get_balance_and_interest_details(acct, rate)

        details.append(
            AccountDetails.generate_random_account_detail_json(acct, info))
        # info[0] is the balance component of the returned list.
        balances.append(
            AccountBalance.generate_random_account_balance_json(
                acct['accountNumber'], info[0]))

    FileUtil.json_to_json_file(details, 'generated-account-details')
    FileUtil.json_to_json_file(balances, 'generated-account-balances')
Пример #3
0
 def make_date_value_dic(self) -> dict:
     """Collect labelled float values keyed by date, one inner dict per label.

     Scans every file under the configured base directory (sorted by file
     name so later files win on duplicate dates) and, for each line that
     mentions a configured label, records the extracted float value under
     the date extracted from the file path.
     """
     result = {label: {} for label in self.config.get_labels()}
     paths = FileUtil.get_recursive_file_paths(self.config.get_base_dir())
     # Sort by basename so processing order (and overwrites) is deterministic.
     paths.sort(key=lambda p: p.split("/")[-1])
     for path in paths:
         for line in FileUtil.read_lines(path):
             for label in result:
                 if label not in line:
                     continue
                 value = HabitReporter.extract_value_float(line)
                 if value is None:
                     continue
                 result[label][DateUtil.extract_date(path)] = value
     return result
Пример #4
0
def convert_doc_to_wordlist(str_doc, cut_all):
    """Tokenize a document into a flat list of words.

    The document is split into lines, stray characters (e.g. \\u3000 and
    spaces) are stripped with ``FileUtil.rm_char``, each line is segmented
    with jieba, and stop words are dropped with ``FileUtil.rm_tokens``.

    Args:
        str_doc: Full document text.
        cut_all: Passed through to ``jieba.cut`` (full mode vs. precise mode).

    Returns:
        Flat list of tokens from all lines, in document order.
    """
    sent_list = str_doc.split('\n')
    # Remove unwanted characters such as \u3000 and plain spaces.
    sent_list = map(FileUtil.rm_char, sent_list)
    # Segment each line, drop stop words, and flatten in one pass.
    # (Replaces sum(list_of_lists, []), which is quadratic in total length.)
    word_list = [
        token
        for part in sent_list
        for token in FileUtil.rm_tokens(jieba.cut(part, cut_all=cut_all))
    ]
    return word_list
Пример #5
0
def get_word_2_pinyin_dict(path='testLines.txt'):
    """Index corpus phrases by three kinds of pinyin keys.

    Each line of the corpus file becomes a Phrase that is registered under
    its full pinyin, its first-letter pinyin, and its initials (shengmu).

    Args:
        path: Corpus file with one phrase per line. Defaults to the
            previously hard-coded 'testLines.txt' for backward compatibility.

    Returns:
        Tuple of three dicts mapping full pinyin, first-letter pinyin, and
        initials (respectively) to lists of Phrase objects.
    """
    lines = FileUtil.get_file(path)
    _normal_pinyin_phrase_dict = dict()  # full pinyin -> phrases
    _first_pinyin_phrase_dict = dict()  # first-letter pinyin -> phrases
    _init_pinyin_phrase_dict = dict()  # initials (shengmu) -> phrases
    for each_line in lines:
        _full_pinyin = ''.join(PinyinUtil.get_lazy_pinyin(each_line))
        _init_pinyin = ''.join(PinyinUtil.get_init_pinyin(each_line))
        _first_pin = ''.join(PinyinUtil.get_first_pinyin(each_line))
        new_phrase = Phrase(each_line, _full_pinyin)

        append_new_val_in_dict(_normal_pinyin_phrase_dict, _full_pinyin,
                               new_phrase)
        append_new_val_in_dict(_first_pinyin_phrase_dict, _first_pin,
                               new_phrase)
        append_new_val_in_dict(_init_pinyin_phrase_dict, _init_pinyin,
                               new_phrase)
    return _normal_pinyin_phrase_dict, _first_pinyin_phrase_dict, _init_pinyin_phrase_dict
Пример #6
0
                    if age > 33:
                        break
                    elif bsu_count < 1:
                        bsu_count += 1
                    else:
                        break
                elif account_type == 'STUDENT BRUKSKONTO':
                    if age > 25 or age < 19:
                        break
                random_int = random.randint(0, 100)
                if random_int <= account_types_with_probability.get(
                        account_type):
                    new_account = Account.generate_random_account_json(
                        person, account_type)
                    accounts.append(new_account)
                    account_count += 1
    return accounts


if __name__ == "__main__":
    # Single positional argument: path to the people JSON file.
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument(
        'people',
        help='A json file containing people to generate accounts for. '
        'This file is typically output by the create_people.py script')
    cli_args = cli.parse_args()

    # Load the people, generate their accounts, and persist the result.
    with open(cli_args.people, encoding='utf-8') as people_file:
        people = json.load(people_file)
        FileUtil.json_to_json_file(create_list_of_account_json(people),
                                   'generated-accounts')
Пример #7
0
#!/usr/bin/env python3
import argparse

from models.person.person import Person
from utils.file_util import FileUtil


def create_list_of_people_json(number_of_people):
    """Return a list of *number_of_people* randomly generated person JSON dicts.

    Args:
        number_of_people: How many random people to generate.

    Returns:
        List of JSON-serializable dicts, one per generated Person.
    """
    # Comprehension replaces the manual append loop (same behavior).
    return [Person.generate_random().to_json() for _ in range(number_of_people)]


# CLI: -n controls how many fake people to generate (default 10).
parser = argparse.ArgumentParser(
    description="Quick hack to generate fake people and some data.")
parser.add_argument(
    '-n', type=int, default=10, help='The number of people to create')
args = parser.parse_args()

# Generate the people and write them to the 'generated-people' JSON file.
people_json = create_list_of_people_json(args.n)
FileUtil.json_to_json_file(people_json, 'generated-people')
Пример #8
0
from create_cards_and_card_balances import create_card_balances

# Orchestrates the full fake-data pipeline: people -> accounts -> details ->
# balances. Each stage also writes its own JSON output file.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Generate people along with associated account data.")
    parser.add_argument('-n',
                        type=int,
                        default=10,
                        help='The number of people to create')
    args = parser.parse_args()

    # Create a list of people to use
    # 'generated-people' json file created
    people = create_list_of_people_json(args.n)
    print('\nCreated people\n')
    FileUtil.json_to_json_file(people, 'generated-people')

    # Create between 1 and 6 accounts for each person
    # 'generated-accounts' json file created
    accounts = create_list_of_account_json(people)
    print('\nCreated accounts\n')
    FileUtil.json_to_json_file(accounts, 'generated-accounts')

    # Create details for each of the accounts
    # 'generated-account-details' and 'generated-account-balances' json files created
    # NOTE(review): create_account_detail_and_account_balance_files appears to
    # return None (it writes its own output files), so 'generated-card-balances'
    # may be written with a null payload here; the target file name also says
    # "card" balances while the data would be account balances — verify intent.
    balances = create_account_detail_and_account_balance_files(accounts)
    print('\nCreated account details\n')
    FileUtil.json_to_json_file(balances, 'generated-card-balances')

    # Add fake payments for each of the accounts
    # 'generated-booked-transactions', 'generated-due-payments' and 'generated-reserved-transactions' json files created
Пример #9
0
    lsi_model = None
    predictor = None
    if not os.path.exists(tmp_data_path):
        os.makedirs(tmp_data_path)

    # n 表示抽样率,n 抽 1
    n = 10

    # # ===================================================================
    # # # # 第一阶段,  遍历文档,生成词典,并去掉频率较少的项
    #       如果指定的位置没有词典,则重新生成一个。如果有,则跳过该阶段
    t0 = time.time()
    if not os.path.exists(path_dictionary):
        print('=== 未检测到有词典存在,开始遍历生成词典 ===')
        dictionary = corpora.Dictionary()
        files = FileUtil.read_file_by_dir(data_path)

        for i, content in enumerate(files):
            if i % n == 0:
                catg = content[0]
                file = content[1]
                file = convert_doc_to_wordlist(file, False)
                dictionary.add_documents([file])
                if int(i / n) % 1000 == 0:
                    print('{t} *** {i} \t docs has been dealed'.format(
                        i=i,
                        t=time.strftime('%Y-%m-%d %H:%M:%S',
                                        time.localtime())))
        # 去掉词典中出现次数过少的
        small_freq_ids = [
            tokenid for tokenid, docfreq in dictionary.dfs.items()
Пример #10
0
#!/usr/bin/env python
# coding=UTF-8
'''
 # Desc:
 # Author:TavisD 
 # Time:2016-10-10 10:53
 # Ver:V1.0
'''

from utils.api_client import APIClient
from utils.file_util import FileUtil
from utils.mysql_util import MysqlUtil
from utils.gen_util import GenUtil

# Module-level singletons shared by code that imports this module.
# NOTE(review): construction order is preserved as-is — presumably the
# constructors have side effects (e.g. MysqlUtil opening a connection);
# confirm before reordering.
api_client = APIClient()
file_util = FileUtil()
mysql_util = MysqlUtil()
gen_util = GenUtil()
Пример #11
0
        account_count = 0
        bsu_count = 0
        while account_count <= number_of_accounts:
            for account_type in account_types_with_probability:
                # a person can not have more than 1 BSU account
                if account_type == 'BSU':
                    if age > 33:
                        break
                    elif bsu_count < 1:
                        bsu_count += 1
                    else:
                        break
                elif account_type == 'STUDENT BRUKSKONTO':
                    if age > 25 or age < 19:
                        break
                random_int = random.randint(0, 100)
                if random_int <= account_types_with_probability.get(account_type):
                    new_account = Account.generate_random_account_json(person, account_type)
                    accounts.append(new_account)
                    account_count += 1
    return accounts


# CLI: the single positional argument is the people JSON file to read.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('people',
                    help='A json file containing people to generate accounts for. '
                         'This file is typically output by the create_people.py script')
args = parser.parse_args()

# Use a context manager with an explicit encoding so the input file is
# closed deterministically (the previous json.load(open(...)) leaked the
# file handle and relied on the platform default encoding).
with open(args.people, encoding='utf-8') as people_file:
    FileUtil.json_to_json_file(
        create_list_of_account_json(json.load(people_file)),
        'generated-accounts')
Пример #12
0
def create_card_balances(cards):
    """Return a random balance record for each credit card in *cards*.

    Cards whose type is not ``CardType.CREDIT`` are skipped — only credit
    cards carry a generated balance.
    """
    return [
        CardBalance.generate_random(card)
        for card in cards
        if card.card_type == CardType.CREDIT
    ]


if __name__ == "__main__":
    # Parse the CLI: the single positional argument is the accounts JSON file.
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument(
        'accounts',
        help=
        'A json file containing accounts to generate cards for. This file is typically output by the create_accounts.py script'
    )
    cli_args = cli.parse_args()

    # Load the accounts, derive cards and credit-card balances, then
    # persist both lists as JSON files.
    with open(cli_args.accounts, encoding='utf-8') as fh:
        accounts_file = json.load(fh)
        cards = create_cards(accounts_file)
        balances = create_card_balances(cards)

        FileUtil.json_to_json_file([card.to_json() for card in cards],
                                   'generated-cards')
        FileUtil.json_to_json_file([bal.to_json() for bal in balances],
                                   'generated-card-balances')
Пример #13
0
# this is the Main file
from utils.file_util import FileUtil
from utils.wordhelper import WordHelper
from utils.pinyin_util import *
from entity import *

if __name__ == '__main__':
    # TODO: first load the corpus file and build the indexes.
    # NOTE(review): this block looks truncated — words_line_dict,
    # normal_pinyin_ciyu_dict and init_pinyin_ciyu_dict are declared but
    # never populated in the visible code, and newPhrase is created but
    # unused; confirm against the full file before relying on this.
    path = 'testLines.txt'
    lines = FileUtil.get_file(path)
    words_line_dict = dict()  # maps a word to its Phrase objects
    normal_pinyin_ciyu_dict = dict()  # maps full pinyin to word objects
    first_pinyin_ciyu_dict = dict()  # maps first-letter pinyin to word objects
    init_pinyin_ciyu_dict = dict()  # maps initials (shengmu) to word objects

    words = []  # holds the segmentation result of one line/phrase

    for each_line in lines:
        _full_pinyin = PinyinUtil.get_lazy_pinyin(each_line)
        newPhrase = Phrase(each_line, _full_pinyin)
        words = WordHelper.seg(each_line)  # TODO: word segmentation
        for each_word in words:
            first_pinyin = PinyinUtil.get_first_pinyin(each_word)  # first-letter pinyin
            normal_pinyin = PinyinUtil.get_lazy_pinyin(each_word)  # full pinyin
            init_pinyin = PinyinUtil.get_init_pinyin(each_word)  # initials (shengmu)
            new_ciyu = Ciyu(each_word, normal_pinyin, first_pinyin)

            # Register the word under its first-letter pinyin.
            # NOTE(review): the key is first_pinyin but the dict is named
            # normal_pinyin_ciyu_dict in the guard above — likely a mix-up;
            # verify which dict this membership test should target.
            if first_pinyin not in normal_pinyin_ciyu_dict:
                first_pinyin_ciyu_dict[first_pinyin] = list()
            first_pinyin_ciyu_dict[first_pinyin].append(each_word)