def words_list(filepath):
    wordslist = []
    with open(filepath, mode='r', encoding='utf-8') as init_file:
        for words_rec in init_file.readlines():
            Aword = StringSplit.stringsplit(words_rec.strip(), (' ', '|'))
            wordslist.append(Aword[0])
    return wordslist
def wordlist(Atext, split_list=' '):
    # split_list: separators between words and characters to filter out (defaults to a space)
    word_list = {}
    for Aword in StringSplit.stringsplit(Atext, split_list):
        lower_word = Aword.lower()
        if lower_word not in word_list:
            word_list[lower_word] = 1
        else:
            word_list[lower_word] = word_list[lower_word] + 1
    return word_list
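# A quick, hypothetical check of wordlist (sample text invented for illustration;
# assumes LearnModule.StringSplit is importable and that stringsplit behaves like
# str.split for a single-space separator). This is a sketch, not part of the module.
if __name__ == '__main__':
    print(wordlist('To be or not to be'))
    # expected: {'to': 2, 'be': 2, 'or': 1, 'not': 1}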
def readWords(filepath):
    with open(filepath, mode='rb') as read_file:
        Words_list = []
        BOM = read_file.read(3)  # skip the 3-byte UTF-8 BOM
        for word_rec in read_file.readlines():
            Aword = StringSplit.stringsplit(
                word_rec.decode('utf-8').strip(',').strip(), ',')
            # print(Aword)
            Aword_NM = {}
            for word_NM in Aword[2:]:
                word_no_me = StringSplit.stringsplit(word_NM, '.')
                # print(word_no_me)
                try:
                    Aword_NM[word_no_me[0] + '.'] = word_no_me[1]
                except IndexError:
                    Aword_NM['other nominal'] = word_no_me[0]
            Words_list.append(EnglishWord(Aword[0], Aword[1], Aword_NM))
    return Words_list
from LearnModule import StringSplit
import json

infilePath = r'F:\memory\python-learning\learning2017\ESP_project\data_news\搜狐新闻语料库.json'
outfilePath = r'F:\memory\python-learning\learning2017\ESP_project\data_news\NewsCalsses.json'

NewsCalsses = {}
TotalCount = 0
with open(infilePath, mode='r', encoding='utf-8') as infile:
    for aline in infile.readlines():
        TotalCount += 1
        try:
            alineJson = json.loads(aline.strip())
        except ValueError:
            print('OrderId:%d -- %s' % (TotalCount, aline))
            continue  # skip lines that are not valid JSON
        if 'url' in alineJson.keys():
            OneCalss = StringSplit.stringsplit(
                StringSplit.stringsplit(alineJson['url'], '/')[1], '.')[0]
            if OneCalss not in NewsCalsses.keys():
                NewsCalsses[OneCalss] = alineJson['url']
        if TotalCount % 1000 == 0:
            print('===== %d =====' % TotalCount)

print('===== Scanned %d records and found %d news categories =====' % (TotalCount, len(NewsCalsses)))

with open(outfilePath, mode='w', encoding='utf-8') as outfile:
    for key in NewsCalsses:
        outfile.write('%s\t%s' % (key, NewsCalsses[key]))
        outfile.write('\n')

# for AClass in NewsCalsses:
#     print(AClass)
    # Acount = String_func.StrExtractNum(NBACschools[1], 1)[0]
    # print('%d: %d' % (i, Acount))
    # i += 1
    # NBACcount = NBACcount + Acount
    for Arec in NBACschools[2:]:
        if Arec != '-' and Arec != '':
            # print(Arec.strip())
            NBACSchoolList.append(Arec.strip())
# print('NBACCount = %d' % NBACcount)
# print('Count of NBACShool is %d' % (len(NBACSchoolList)))

# ========== Read the universities in the Higher Education Discipline Innovation ("111") Program ==========
P111schoolList = []
for Aschool in readCSV(datafilepath4, 'utf-8'):
    P111schools = StringSplit.stringsplit(Aschool[0], (':', '、'))
    # Acount = String_func.StrExtractNum(P111schools[0], 1)[0]
    # print('%d: %d' % (i, Acount))
    # i += 1
    # NBACcount = NBACcount + Acount
    for Arec in P111schools[1:]:
        if Arec != '-' and Arec != '':
            # print(Arec.strip())
            P111schoolList.append(Arec.strip())
# print('NBACCount = %d' % NBACcount)
# print('Count of NBACShool is %d' % (len(P111schoolList)))

# ========== Read the Project 985 and Project 211 university information ==========
HighSchoolCPinfo = {}
for Aschool in readCSV(datafilepath2, 'GBK')[2:]:
data = {
    'tableNames': 'officialdocs',
    'size': 5000,
    # 'query': 'title:石沉大海',
    'field': 'record_type'
}
conn.request(method="POST",
             url="http://192.168.10.179:9095/api/MssSearchApi/searchByQuery",
             headers=headerdata,
             body=parse.urlencode(data))

# Fetch the full search result and parse it as JSON
get_result = json.loads(conn.getresponse().read().decode('utf-8'))
result_list = get_result['obj']['hits']['hits']

DocTypeList = []
splitstr = ['〔', '第']
for Adata in result_list:
    # print(Adata['_source']['symbol_of_document_lssuing'])
    try:
        AdocType = StringSplit.stringsplit(
            Adata['_source']['symbol_of_document_lssuing'], splitstr)[0]
    except Exception:
        AdocType = ''
    if AdocType not in DocTypeList:
        print(Adata['_source']['symbol_of_document_lssuing'])
        DocTypeList.append(AdocType)

for Adata in DocTypeList:
    print(Adata)
AfileName = FileName[0]
if ischinese(AfileName):
    FileRecord['title'] = AfileName
else:
    FileRecord['title'] = OutFiles[Findex].strip().decode('utf-8') + '_' + AfileName
if Findex < len(OutFiles) - 1:
    Findex += 1
else:
    Findex = 0
FileRecord['file_suffix'] = FileName[1].strip('.').lower()
FileRecord['fileformat'] = FileRecord['file_suffix']

# =============================== Set the file-path related attributes ==============================
SetFilePath = StringSplit.stringsplit(alineJson['文件路径'], '\\')
FilePath = ''
for Apath in SetFilePath[1:]:
    FilePath = os.path.join(FilePath, Apath)
FileRecord['fileURL'] = FilePath

# =============================== Set the category the file belongs to ==============================
FileCat = ''
for Apath in SetFilePath[2:-1]:
    FileCat = FileCat + '//' + Apath
FileRecord['FilesCat'] = FileCat

# =============================== Set the file's other attributes ==============================
if '备注' in AfileKey:
    FileRecord['content'] = alineJson['备注'].strip()
elif key == '作者':
    FileRecord['authors'] = alineJson[key]
elif key == '文件大小':
    FileRecord['filesize'] = SetFileSize(int(alineJson[key]))
elif key == '创建时间':
    FileRecord['edittime'] = FormatTime(alineJson[key])
elif key == '修改时间':
    FileRecord['pubtime'] = FormatTime(alineJson[key])

FileRecord['belongdep'] = Data_normalization.SetDepartment(deplist)
FileRecord['filename'] = FilePathList[fileindex][1]
FileName = os.path.splitext(FilePathList[fileindex][1])
# =========================================================
FilePathSet = StringSplit.stringsplit(FilePathList[fileindex][0], '\\')
FilePath = ''
FilesCat = ''
for xpath in FilePathSet[3:]:
    FilePath = os.path.join(FilePath, xpath)
for xpath in FilePathSet[4:]:
    FilesCat = FilesCat + '//' + xpath
FileRecord['fileURL'] = os.path.join(FilePath, FilePathList[fileindex][1])
FileRecord['FilesCat'] = FilesCat
FileRecord['title'] = FileName[0]
FileRecord['file_suffix'] = FileName[1].strip('.').lower()
FileRecord['fileformat'] = SetFileFormat(FileRecord['file_suffix'])
FileRecord['viewcounts'] = random.randint(0, 117)
        mode='rb') as imagesfile:
    # OutImagesFiles = imagesfile.readlines()
    for IFile in ImageFilesName:
        if IFile[2] != []:
            for Afile in IFile[2]:
                AimageFile['filename'] = Afile
                AFileName, AFileSuffix = os.path.splitext(Afile)
                if ischinese(Afile):
                    AimageFile['title'] = AFileName
                else:
                    AimageFile['title'] = OutFiles[Findex].strip().decode(
                        'utf-8') + '_' + AFileName
                AimageFile['file_suffix'] = AFileSuffix.strip('.')
                FilePathSet = StringSplit.stringsplit(IFile[0], '\\')
                FilePath = ''
                FilesCat = ''
                for xpath in FilePathSet[3:]:
                    FilePath = os.path.join(FilePath, xpath)
                for xpath in FilePathSet[4:]:
                    FilesCat = FilesCat + '//' + xpath
                AimageFile['fileURL'] = os.path.join(FilePath, Afile)
                AimageFile['FilesCat'] = FilesCat
                AimageFile['belongdep'] = SetDepartment(deplist)
                AimageFile['viewcounts'] = random.randint(0, 163)
                if Findex < len(OutFiles) - 1:
                    Findex = Findex + 1
                else:
                    Findex = 0
    while True:
        if alineStr[0] == '(':
            # strip a leading parenthesized prefix such as "(...)"
            while alineStr[0] != ')':
                alineStr = alineStr[1:]
            alineStr = alineStr[1:]
        else:
            break
    AKLdata['content'] = alineStr
elif key == 'question':
    AKLdata['title'] = alineJson[key]
elif key == 'author':
    AKLdata['author'] = alineJson[key]
elif key == 'tags':
    AKLdata['keywords'] = alineJson[key]
elif key == 'answer_time':
    AKLdata['pubtime'] = StringSplit.stringsplit(alineJson[key], 'T')[0]

AKLdata['viewcounts'] = random.randint(0, 83)
AKLdata['subjectCat'] = Data_normalization.SetCatlog(catlog)
AKLdata['belongdep'] = Data_normalization.SetDepartment(deplist)
# ============ Randomly assign one appendix file to this record ============
appendixFilePath = Data_normalization.SetAppendixFiles(appendixfilesPath, n)
if appendixFilePath != '':
    n = n + 1
    AKLdata['appendix_URL'] = appendixFilePath
    AKLdata['appendix_filenames'] = [
        os.path.basename(x) for x in appendixFilePath
    ]
else:
# -*- coding: utf-8 -*-
import random
from LearnModule import StringSplit

usernamelist = []
with open(r'F:\documents\python\learning2017\program data\测试用人员名单.dat',
          mode='rb') as outfile:
    outfile.readline()  # skip the header line
    for aline in outfile.readlines():
        aline_decode = aline.decode('utf-8').strip()
        Auser = StringSplit.stringsplit(aline_decode, '\t')
        # print(Auser)
        inserttag = 1
        try:
            username = Auser[3]
        except IndexError as e:
            print('%s --> record %s is malformed' % (e, Auser[0]))
            inserttag = 0
        except TypeError as e:
            print('record is malformed')
            inserttag = 0
        if inserttag:
            usernamelist.append(username)

usernameset = set()
usernamelistlen = len(usernamelist)
while len(usernameset) < 121:
    usernameset.add(usernamelist[random.randint(0, usernamelistlen - 1)])
CurrentTable = PokerTable(initTableParameter(), [''] * 4, [''] * 4, menutext)
CurrentTable.ShowTable()
while not ifVictory(CurrentTable.GamePokersList):
    GameOperating = input('Enter an operation command according to the menu above: ')
    if GameOperating.lower() == 'q':
        print('Game over!')
        break
    elif GameOperating.lower() == 'r':
        print('Starting a new game!')
        CurrentTable = PokerTable(initTableParameter(), [''] * 4, [''] * 4, menutext)
        CurrentTable.ShowTable()
    else:
        Operation = StringSplit.stringsplit(GameOperating, ('(', ')', ','))
        # print(Operation)
        if Operation[0] == '1':
            # Command 1(columnA,columnB,N): move the bottom group of N cards from
            # game-area column A to the bottom of column B; N defaults to 1 when omitted.
            ExcuteTag = 1
            if len(Operation) == 3:
                MoveCount = 1
            elif len(Operation) == 4:
                MoveCount = int(Operation[3])  # convert the card count to an int
            else:
                print('Please enter a valid operation command as described in the menu above')
                ExcuteTag = 0
            if ExcuteTag:
                if Game2Game(CurrentTable.GamePokersList, CurrentTable.TempPokersList,
                             Operation[1], Operation[2], MoveCount):
                    CurrentTable.ShowTable()
def time2seconds(time):
    h, m, s = StringSplit.stringsplit(time, ':')  # split into hours, minutes, seconds on ':'
    return int(h) * 3600 + int(m) * 60 + int(s)
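# A hedged sanity check for time2seconds (sample value chosen for illustration;
# assumes stringsplit splits on ':' the way str.split does).
if __name__ == '__main__':
    print(time2seconds('01:02:03'))  # 1*3600 + 2*60 + 3 = 3723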
# -*- coding: utf-8 -*-
# Build full three-level subject category paths from the hierarchical code file
from LearnModule import StringSplit

KLcatlist = []
with open('F:/documents/python/learning2017/ESP_project/data_knowledge/学科分类11.dat',
          mode='rb') as infile:
    for aline in infile.readlines():
        aline_decode = aline.decode('utf-8').strip()
        if aline_decode != '':
            Arecord = StringSplit.stringsplit(aline_decode, ' ')
            # print(Arecord)
            if len(Arecord[0]) == 2:    # 2-digit code: level-1 category
                Acatlvl1 = Arecord[1]
            if len(Arecord[0]) == 4:    # 4-digit code: level-2 category
                Acatlvl2 = Arecord[1]
            if len(Arecord[0]) == 6:    # 6-digit code: level-3 category
                Acatlvl3 = Arecord[1]
                Acatlog = Acatlvl1 + '\\' + Acatlvl2 + '\\' + Acatlvl3
                KLcatlist.append(Acatlog)

with open('F:/documents/python/learning2017/ESP_project/data_knowledge/学科分类22.dat',
          mode='wb') as outfile:
    for arecord in KLcatlist:
        outfile.write(arecord.encode('utf-8'))
        outfile.write('\n'.encode('utf-8'))
# ClassNO_Split = []
test_txt_temp = []
# ZT_Class_dict = {}
with open('C:/Users/flyingaura/Desktop/test.txt', 'rb') as init_file:
    for fline in init_file.readlines():
        # fline.strip()
        # print(fline)
        fline_decode = fline.decode('utf-8')
        # print(fline_decode)
        if fline_decode.strip() != '':
            test_txt_temp.append(fline_decode.strip())

# Process the data read from the file
test_txt_temp.pop(0)
for key in test_txt_temp:
    fline_split.append(StringSplit.stringsplit(key, '\t'))
for Kstring in fline_split:
    if len(Kstring) == 1:
        ZTNo_List.append(ZTNo_source(Kstring[0], '', ''))
    elif len(Kstring) == 2:
        for ZTkey in StringSplit.stringsplit(Kstring[1], ';'):
            ZTNo_List.append(ZTNo_source(Kstring[0], ZTkey, ''))
    else:
        ZTNo_temp = StringSplit.stringsplit(Kstring[1], ';')
        ZTNo_name = StringSplit.stringsplit(Kstring[2], ';')
        for i in range(len(ZTNo_temp)):
            if i < len(ZTNo_name):
                ZTNo_List.append(
                    ZTNo_source(Kstring[0], ZTNo_temp[i], ZTNo_name[i]))
            else:
                ZTNo_List.append(ZTNo_source(Kstring[0], ZTNo_temp[i], ''))
# -*- coding: utf-8 -*-
# Convert the extracted news categories into Chinese names
from LearnModule import StringSplit

infilepath = r'F:\memory\python-learning\learning2017\ESP_project\data_news\NewsCalsses.json'
RestClasses = []
with open(infilepath, mode='r', encoding='utf-8') as infile:
    for aline in infile.readlines():
        if 'auto' in StringSplit.stringsplit(aline, '\t')[1]:
            continue
        else:
            print(aline)
            RestClasses.append(aline)
# print(len(RestClasses))
        alias_street_list.append(self.street[:filter_index])
        if alias_street_list:
            return alias_street_list
        else:
            return None


toponymy_list = []
the_towns = '-'
with open('C:/Users/flyingaura/Desktop/昌平区.dat', mode='rb') as in_file:
    the_province = '北京市'
    the_city = in_file.readline().decode('utf-8').strip()
    for rec in in_file.readlines():
        rec_data = rec.decode('utf-8').strip()
        if '\t' in rec_data:
            rec_TS = StringSplit.stringsplit(rec_data, '\t')
            the_towns = rec_TS[0]
            rec_street = rec_TS[1].strip('\"')
        else:
            rec_street = rec_data.strip('\"')
        if rec_street:
            for the_street in StringSplit.stringsplit(rec_street, '、'):
                rec_toponymy = toponymy(the_province, the_city, the_towns, the_street)
                toponymy_list.append(rec_toponymy)

# for rec_ty in toponymy_list:
#     print('============ %s' % rec_ty.street)

province_list = []
# -*- coding: utf-8 -*-
from LearnModule import StringSplit

with open('C:/Users/flyingaura/Desktop/test.txt', mode='rb') as wordsfile:
    wordsfile.readline()  # skip the header line
    # print(wordsfile.readline().decode('utf-8'))
    all_words = []
    for line in wordsfile.readlines():
        for word in StringSplit.stringsplit(
                line.decode('utf-8').strip(), (',', ' ', '.', '-')):
            all_words.append(word.lower())

print('The count of Speech Words is: %d' % len(all_words))
print(list(all_words))

unique_words = {}
for aword in all_words:
    if aword not in unique_words.keys():
        unique_words[aword] = all_words.count(aword)
print('The count of unique words in this speech is: %d' % len(unique_words))
for keys in sorted(unique_words, key=lambda x: unique_words[x], reverse=True):
    print('%s : %d' % (keys, unique_words[keys]))
        if tagList[i] < Sep_tagList[i]:
            tag_name = 'tag' + chr(ord('A') + i)
            BadDiag_wrong[tag_name] = BadDiag_wrong[tag_name] + 1
    return (GoodDiag_wrong, BadDiag_wrong)


# ================ Main program =====================
AllData = []
# Read all the training data from the file
with open('C:/Users/flyingaura/Desktop/breast-cancer-wisconsin.data', mode='rb') as Datafile:
    for Aline in Datafile.readlines():
        Adata = []
        save_tag = 1
        Aline_decode = Aline.decode('utf-8').strip()
        for x in StringSplit.stringsplit(Aline_decode, ','):
            try:
                Adata.append(int(x))
            except ValueError:
                save_tag = 0
                break
        if save_tag:
            AllData.append(breast_data(Adata[0], Adata[1], Adata[2], Adata[3], Adata[4],
                                       Adata[5], Adata[6], Adata[7], Adata[8], Adata[9],
                                       Adata[10]))

# Randomly pick 400 records as the training set; the rest are used for testing
ML_indexList = set()
ML_count = 400
DataCount = len(AllData)
print(DataCount)
if DataCount < 400:
    def get_duty(self):
        return self.duty

    def get_mobilephone(self):
        return self.mobilephone

    def get_email(self):
        return self.email


# -------- Read all the names from the file ---------
person_list = []
with open('C:/Users/flyingaura/Desktop/Chinese_surname.txt', 'rb') as person_file:
    # Read each person's basic information line by line and save it into person_list
    for one in person_file.readlines():
        OnePerson = StringSplit.stringsplit(one.strip().decode('utf-8'), '\t')
        Pinfo_temp = ['', '', '', '']
        for i in range(len(OnePerson)):
            Pinfo_temp[i] = OnePerson[i]
        personinfo = person_info(Pinfo_temp[0], Pinfo_temp[1], Pinfo_temp[2], Pinfo_temp[3])
        person_list.append(personinfo)
person_list.pop(0)  # drop the header record

# Start the analysis
sur_list = []
# Extract every surname
for person_key in person_list:
    sur_list.append(get_surname(person_key.get_name()))
# print(sur_list)

# Count the surnames
index_sur = []
# ================ Main program =====================
# AllData = []
breastPatientList = []
# Read all the training data from the file
with open('G:/memory/python-learning/learning2017/program data/breast-cancer-wisconsin.data',
          mode='rb') as Datafile:
    for Aline in Datafile.readlines():
        # breastAttrList = []
        # save_tag = 1
        Aline_decode = Aline.decode('utf-8').strip()
        APatientData = StringSplit.stringsplit(Aline_decode, ',')
        try:
            breastAttrList = [[int(x), '', 0] for x in APatientData[1:-1]]
        except ValueError as e:
            print('ValueError --> invalid attribute: the value cannot be converted to an integer')
            continue
        ABpatient = breast_patients(0, [], '')
        ABpatient.patientID = APatientData[0]
        ABpatient.breastAttrList = breastAttrList
        if APatientData[-1] == '2':
            ABpatient.diagResult = 'Good'
        elif APatientData[-1] == '4':
            ABpatient.diagResult = 'Bad'
        else:
            ABpatient.diagResult = 'UnKnown'
# -*- coding: utf-8 -*-
# Tests for using the custom modules
import sys
import math
import functools
from LearnModule import Module_test
from LearnModule import StringSplit
from LearnModule import String_func

print('the module\'s note is :', Module_test.__doc__)
print('the module\'s author is :', Module_test.__author__)
print('the module\'s edit time is :', StringSplit.__edittime__)
# Module_test.test()

StrList = 'aaa,acb.923asd,88sdfffdd.,sdfk__sldf,1sdfjjj,asiw2,ddd,AFD'
# First split the string
StrNList = StringSplit.stringsplit(StrList, (',', 's'))
# Then wrap each substring with the given marker
for StrN in StrNList:
    print(String_func.StringAddVal(StrN, '***'))
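# The LearnModule.StringSplit module itself is not shown in these scripts. Below is
# a minimal, hypothetical sketch of how stringsplit appears to behave based on its
# usage here: split a string on one or more separators (a single string or a
# tuple/list of strings) and drop empty fragments. This is an assumption, not the
# real implementation, and the name stringsplit_sketch is invented for illustration.
import re

def stringsplit_sketch(text, separators=' '):
    # Accept either a single separator or a collection of separators.
    if isinstance(separators, str):
        separators = (separators,)
    pattern = '|'.join(re.escape(sep) for sep in separators)
    return [part for part in re.split(pattern, text) if part != '']

# Example: stringsplit_sketch('aaa,acb.923asd', (',', 's')) -> ['aaa', 'acb.923a', 'd']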
# -*- coding: utf-8 -*-
# Clean up the company contact list and build the full organizational hierarchy
from LearnModule import StringSplit

OrgList = []
with open('C:/Users/flyingaura/Desktop/公司通讯录数据(开发用).dat', mode='rb') as outfile:
    for aline in outfile.readlines():
        aline_decode = aline.decode('utf-8')
        OrgList.append(StringSplit.stringsplit(aline_decode.strip(), '\t'))
# print(OrgList)

OrgNameList = []
OrgTemp = []
for unit in OrgList:
    if unit is None:
        unit = []
    OrgName = ''
    if len(unit) > 1:
        for alevel in unit:
            OrgName = OrgName + '\\' + alevel
        OrgNameList.append(OrgName)
        OrgTemp = unit[:-1]
    elif len(unit) == 1:
        if OrgTemp == []:
            OrgName = OrgName + '\\' + unit[0]
        else:
            for alevel in OrgTemp:
                # print(alevel)
                OrgName = OrgName + '\\' + alevel
n = 0  # starting index used when assigning appendix files
with open(
        'F:/documents/python/learning2017/ESP_project/data_news/origin_news.json',
        mode='rb') as infile:  # origin_news
    StripStr = '安卓网 > \t\t\t\t科技频道 \t\t\t\t\t\t\t\t\t \t\t\t\t\t\t\t\t\t > 互联\t\t\t\t\t\t\t \t\t <!-- //头部 ---> \t \t\t\t \t\t\t\t \t\t\t\t\t'
    for Aline in infile.readlines():
        Aline_decode = Aline.decode('utf-8').strip()
        AlineJson = json.loads(Aline_decode)
        # print(AlineJson)
        for key in AlineJson:
            # print(key)
            if key == '标题':
                AnormalData['title'] = AlineJson[key]
            elif key == '正文内容':
                try:
                    AnormalData['content'] = StringSplit.stringsplit(
                        AlineJson[key].strip(StripStr), '\t')[-1]
                except TypeError as e:
                    print(AlineJson[key].strip(StripStr))
            elif key == '链接':
                AnormalData['Original_address'] = AlineJson[key]
        AnormalData['pubtime'] = SetPubtime(20150101, '00:00:00',
                                            random.randint(0, 730),
                                            random.randint(0, 24 * 3600))
        AnormalData['belongdep'] = SetDepartment(deplist)
        AnormalData['viewcounts'] = random.randint(0, 76)
        AnormalData['source'] = SetSource(sourcelist)
        AnormalData['NewsCat'] = SetCatlog(catlog)
        # ============ The following randomly assigns one appendix file to this news item ============
# Open the show-hand poker hands file showhand_pokers.dat
with open('C:/Users/flyingauraHome/Desktop/showhand_pokers.dat', mode='r') as showhandfile:
    showhandfile.readline()
    # =========================== Start classifying the hand types ===========================
    start_time = time.time()
    print(
        '============== Showhand Pokers Aggregations by Ranks! Now Starting Calculation @ %s =============='
        % time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time)))
    showhand_count = 0
    for line in showhandfile.readlines():
        # Each line looks like: [A(1),Heart] [A(1),Spade] [A(1),Diamond] [A(1),Club] [2(2),Heart]
        error_tag = 0
        onehand = []
        for mpoker in StringSplit.stringsplit(
                line.strip(), ('[', ']', '\t')):  # split into 5 separate card strings such as A(1),Heart
            ppoker = StringSplit.stringsplit(
                mpoker, ('(', ')', ','))  # split each card string into its three attributes: A, 1, Heart
            try:
                onehand.append(Apoker(int(ppoker[1]), ppoker[2]))
            except ValueError as e:
                print('ValueError:%s' % e)
                error_tag = 1
                # showhand_count = showhand_count - 1
        # for onepoker in onehand:
        #     print('[%s(%d),%s]' % (onepoker.Value_show(), onepoker.Pvalue, onepoker.Pflower), end='\t')
        # print('\n')
        if error_tag:
            continue
        straight_tag = 0
        pairs_poker = count_pairs(onehand)
def FormatTime(xtime):
    # Turn an ISO-8601 style timestamp (e.g. 2017-05-01T08:30:00Z) into 'date time' form
    Ftime = StringSplit.stringsplit(xtime, ('T', 'Z'))
    return Ftime[0] + ' ' + Ftime[1]
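# A hedged usage sketch for FormatTime (sample value chosen for illustration;
# assumes stringsplit splits on both 'T' and 'Z' and drops the empty trailing part).
if __name__ == '__main__':
    print(FormatTime('2017-05-01T08:30:00Z'))  # expected: '2017-05-01 08:30:00'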
def readCSV(filepath, reqcode='utf-8'):
    with open(filepath, mode='r', encoding=reqcode) as infile:
        datalist = []
        for aline in csv.reader(infile):
            datalist.append(aline)
    return datalist


DataDistrList = []
for aline in readCSV(filepath):
    DataRec = {}
    if not aline[0].isdigit():
        DataDistr = StringSplit.stringsplit(aline[0], ('(', ')'))
        try:
            DataRec[DataDistr[0]] = String_func.StrExtractNum(DataDistr[1], 1)[0]
        except IndexError:
            continue
        DataDistrList.append(DataRec)

with open(r'F:\documents\python\learning2017\program data\高校相关数据\高校按省分布.dat',
          mode='w', encoding='utf-8') as outfile:
    for Arec in DataDistrList:
        for key in Arec:
            outfile.write('%s\t%d' % (key, Arec[key]))
            outfile.write('\n')