import random
import utils


def main(infile, outfile):
    # Append a random score in [0, 1000000] to every record so that
    # downstream steps can order otherwise-equal items arbitrarily.
    fout = open(outfile, "wb")
    for item in utils.fileLineIter(infile):
        score = random.randint(0, 1000000)
        item.append(score)
        fout.write(utils.mergeToLine(item).encode("utf-8"))
    fout.close()
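# ---------------------------------------------------------------------------
# Every script in this dump leans on a small `utils` module that is not
# included. Below is a minimal sketch of the assumed interface, inferred
# purely from the call sites: fileLineIter yields one delimiter-split record
# per line, filesLineIter chains several files, and mergeToLine is the
# inverse. The tab delimiter is an assumption, not confirmed by the source.

def fileLineIter(infile, sep="\t"):
    # Accept a path or an already-open file object, since the scripts
    # in this dump pass both.
    f = open(infile, "r", encoding="utf-8") if isinstance(infile, str) else infile
    for line in f:
        if isinstance(line, bytes):
            line = line.decode("utf-8")
        yield line.rstrip("\r\n").split(sep)

def filesLineIter(infiles, sep="\t"):
    # Chain several input files into one stream of records.
    for infile in infiles:
        for item in fileLineIter(infile, sep):
            yield item

def mergeToLine(item, sep="\t"):
    # Inverse of fileLineIter: join the fields back into one line.
    return sep.join(str(field) for field in item) + "\n"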
import utils
from Heap import Heap

# Regional categories that are folded into a single rt_World bucket.
WORLD_CATEGORIES = {
    "rt_Canada", "rt_UK", "rt_NewZealand", "rt_Ireland",
    "rt_Australia", "rt_India", "rt_SouthAfrica",
}


def main(file_in, file_out, max_size_percategory, category_set, truncate=False):
    dict_newslist = {}
    for item in utils.fileLineIter(file_in):
        # Record layout: url = item[0], category = item[1],
        # subcategory = item[2], title = item[3], content = item[4]
        category = item[1]
        if category not in category_set:
            if category in WORLD_CATEGORIES:
                # Fold regional categories into rt_World and clear the
                # now-meaningless subcategory.
                category = "rt_World"
                item[1] = "rt_World"
                item[2] = ""
            else:
                continue
        if category not in dict_newslist:
            # Keep at most max_size_percategory records per category;
            # cmp is the comparator used by the original code (the
            # Python 2 builtin, or one supplied alongside Heap).
            dict_newslist[category] = Heap(max_size_percategory, cmp)
        dict_newslist[category].insert(item)

    # Write the surviving records to file.
    f_out = open(file_out, "wb")
    for cat in dict_newslist:
        h = dict_newslist[cat]
        print("[*] %s %d" % (cat, h.size))
        for item in h.enum():
            # (Disabled in the original) when truncate=True, item[4] was
            # trimmed to ~12*300 chars at a word boundary and its first
            # ~30 chars dropped to the next word boundary.
            f_out.write(utils.mergeToLine(item).encode("utf-8"))
    f_out.close()
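# ---------------------------------------------------------------------------
# The Heap class is also external. Below is a minimal sketch of a
# size-bounded heap matching only the interface used above (insert, size,
# enum), built on heapq. Ranking by the record's last field (a score) is an
# assumption; the cmp argument is accepted just for interface compatibility.
import heapq

class Heap:
    def __init__(self, max_size, cmp=None):
        self.max_size = max_size
        self._items = []    # min-heap of (key, counter, item)
        self._counter = 0   # tie-breaker so items are never compared

    @property
    def size(self):
        return len(self._items)

    def insert(self, item):
        key = float(item[-1])  # assumed ranking key: the appended score
        self._counter += 1
        entry = (key, self._counter, item)
        if len(self._items) < self.max_size:
            heapq.heappush(self._items, entry)
        else:
            # Push, then evict the smallest: keeps the max_size highest keys.
            heapq.heappushpop(self._items, entry)

    def enum(self):
        for _, _, item in self._items:
            yield item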
import utils


def main(infile, outfile, title_idx=3):
    # Deduplicate news records by case-insensitive title.
    repeat_count = 0
    total = 0
    f_out = open(outfile, "wb")
    seen = set()
    for item in utils.filesLineIter(infile):  # infile: list of input files
        total += 1
        title = item[title_idx].strip().lower()
        if title not in seen:
            seen.add(title)
            f_out.write(utils.mergeToLine(item).encode("utf-8"))
        else:
            repeat_count += 1
    f_out.close()
    print("[*] repeated news: %s  total: %s" % (repeat_count, total))
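# Hypothetical invocation of the dedup step; the paths are placeholders,
# not paths confirmed by the original repo. Note that filesLineIter
# expects a *list* of input files.
if __name__ == "__main__":
    main(["Dataset/raw/AllData"], "Dataset/raw/AllData.dedup")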
import utils
from KeyWord import KeyWord

# Module-level switch: when True, a URL-derived category may override the
# labelled one if it has higher priority. (Defined at module level in the
# original; the default value here is an assumption.)
UsePriority = True


def run(infile, jsonfile, outfile, max_num):
    fin = open("Dataset/" + infile, "rb")
    fout = open("Dataset/" + outfile, "wb")
    keyword = KeyWord(jsonfile)
    cnt = {}
    total = 0
    for item in utils.fileLineIter(fin):
        url = item[0]
        category = item[1]
        # Only categories defined in the JSON file are considered.
        if not keyword.isCategoryExist(category):
            continue
        join = keyword.getJoinType(category)
        urlCategory = keyword.getUrlCategory(url)
        remainCurItem = False
        if urlCategory is not None:
            # Keep the record when the URL agrees with the label, or when
            # the URL-derived category outranks it.
            if urlCategory == category or (
                    UsePriority and keyword.priorityCompare(urlCategory, category) > 0):
                remainCurItem = True
        elif join == "outer":
            # Outer join: keep unmatched records under their labelled category.
            remainCurItem = True
            urlCategory = category
        if remainCurItem:
            if urlCategory not in cnt:
                cnt[urlCategory] = 0
            if cnt[urlCategory] < max_num:
                if item[1] != urlCategory:
                    # Relabel and clear the stale subcategory.
                    item[1] = urlCategory
                    item[2] = ""
                fout.write(utils.mergeToLine(item).encode("utf-8"))
                cnt[urlCategory] += 1
                total += 1
    fin.close()
    fout.close()
    print("cnt=%s" % (cnt))
    print("sum=%s" % (total))
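# ---------------------------------------------------------------------------
# The KeyWord class (driven by a JSON config) is not shown. Below is a
# minimal stub matching only the four methods called above; the JSON layout
# (join type, priority, URL keywords per category) is entirely an assumption.
import json

class KeyWord:
    def __init__(self, jsonfile):
        # Assumed layout: {category: {"join": "inner"|"outer",
        #                             "priority": int,
        #                             "url_keywords": [str, ...]}}
        with open(jsonfile, "r", encoding="utf-8") as f:
            self.conf = json.load(f)

    def isCategoryExist(self, category):
        return category in self.conf

    def getJoinType(self, category):
        return self.conf[category].get("join", "inner")

    def getUrlCategory(self, url):
        # Return the first category whose URL keyword occurs in the URL.
        for cat, spec in self.conf.items():
            if any(kw in url for kw in spec.get("url_keywords", [])):
                return cat
        return None

    def priorityCompare(self, cat_a, cat_b):
        # Positive when cat_a outranks cat_b.
        return self.conf[cat_a].get("priority", 0) - self.conf[cat_b].get("priority", 0)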
import random
import utils
from BWList import BWList
from Trie import Trie


def main(prefix_files, news_files_in, news_file_out):
    if news_file_out[:8] != "Dataset/":
        news_file_out = "Dataset/" + news_file_out
    news_out = open(news_file_out, "wb")

    # (1) Black/white list
    bwList = BWList()
    print("[*] load black/white list success")

    # (2) Prefix tree
    trie = buildTrie(prefix_files)
    print("[*] load prefix files success")

    # (3) Score every record
    for item in utils.fileLineIter(news_files_in):
        url = item[0]
        category = item[1]
        content = item[4]
        if category == "rt_Unclassified":
            continue
        score = trie.getScore(url, category)
        if bwList.isInWhiteList(url, category):
            score *= 2
        # Penalize records whose content is shorter than 600 characters.
        c_len = len(content)
        if c_len < 600:
            score *= c_len / 600
        # Add a tiny random jitter (randint(10000, 99999) / 1e9, i.e. about
        # 1e-5 to 1e-4) so records with identical scores order randomly.
        score += random.randint(10000, 99999) / 1000000000
        item.append(score)
        news_out.write(utils.mergeToLine(item).encode("utf-8"))
    news_out.close()
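# ---------------------------------------------------------------------------
# BWList and buildTrie are external as well. Below are minimal stubs matching
# only the calls made above, reusing the fileLineIter sketch from earlier.
# Every file path, file format, and method signature here is an assumption.

class BWList:
    def __init__(self, whitelist_file="Dataset/whitelist.txt"):
        # Assumed format: one "url_prefix<TAB>category" pair per line.
        self.white = set()
        with open(whitelist_file, "r", encoding="utf-8") as f:
            for line in f:
                prefix, category = line.rstrip("\n").split("\t")
                self.white.add((prefix, category))

    def isInWhiteList(self, url, category):
        return any(url.startswith(p) for p, c in self.white if c == category)

def buildTrie(prefix_files):
    # Assumed: each prefix file maps a URL prefix and a category to a score,
    # and Trie exposes an insert(...) alongside the getScore(url, category)
    # that the scoring loop above relies on.
    trie = Trie()
    for path in prefix_files:
        for prefix, category, score in fileLineIter(path):
            trie.insert(prefix, category, float(score))
    return trie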
import utils

if __name__ == "__main__":
    # Concatenate the two raw crawls into a single file.
    files_in = [
        "Dataset/raw/WithoutQueryJoin_en_2017-08-29_2018-08-28_17_17.txt",
        "Dataset/raw/WithoutQueryJoin_en_2017-08-29_2018-08-28_14_14.txt",
    ]
    file_out = "Dataset/raw/AllData"
    fout = open(file_out, "wb")
    for item in utils.filesLineIter(files_in):
        fout.write(utils.mergeToLine(item).encode("utf-8"))
    fout.close()