Example #1
def main(infile, outfile):
    fout = open(outfile, "wb")
    for item in utils.fileLineIter(infile):
        # append a random score to each record and write it back out
        score = random.randint(0, 1000000)
        item.append(score)
        fout.write(utils.mergeToLine(item).encode("utf-8"))
    fout.close()
    return
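All the examples below lean on the project's utils helpers, whose implementation is not shown. A minimal sketch of how they likely behave, assuming one UTF-8, tab-separated record per line; fileLineIter, filesLineIter and mergeToLine here are hypothetical stand-ins, not the project's actual code:

def fileLineIter(path_or_file):
    # Hypothetical stand-in: yield each line of a tab-separated file as a
    # list of fields; accepts either a path or an already opened binary file.
    f = path_or_file if hasattr(path_or_file, "read") else open(path_or_file, "rb")
    for line in f:
        yield line.decode("utf-8").rstrip("\n").split("\t")

def filesLineIter(paths):
    # Hypothetical stand-in: the same iteration chained over several files.
    for p in paths:
        for fields in fileLineIter(p):
            yield fields

def mergeToLine(fields):
    # Hypothetical stand-in: join fields back into one tab-separated line,
    # converting non-string values (e.g. an appended score) to str.
    return "\t".join(str(x) for x in fields) + "\n"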
Example #2
def main(file_in,
         file_out,
         max_size_percategory,
         category_set,
         truncate=False):
    dict_newslist = {}
    for fields in utils.fileLineIter(file_in):
        # unpack the tab-separated record
        url = fields[0]
        category = fields[1]
        subcat = fields[2]
        title = fields[3]
        if category not in category_set:
            if category in {
                    "rt_Canada", "rt_UK", "rt_NewZealand", "rt_Ireland",
                    "rt_Australia", "rt_India", "rt_SouthAfrica"
            }:
                # fold regional categories into rt_World
                category = "rt_World"
                fields[1] = "rt_World"
                fields[2] = ""
            else:
                continue
        if category not in dict_newslist:
            dict_newslist[category] = Heap(max_size_percategory, cmp)
        dict_newslist[category].insert(fields)

    # write to file
    f_out = open(file_out, "wb")
    for cat in dict_newslist:
        h = dict_newslist[cat]
        print("[*] %s %d" % (cat, h.size))
        for item in h.enum():
            #item=item[:5]
            # truncate tail and head
            """
            if truncate:
                idx=12*300
                while idx>0 and idx<len(item[4]) and item[4][idx]!=" ":
                    idx-=1
                item[4]=item[4][:idx]
                #
                idx=30
                while idx<len(item[4]) and item[4][idx]!=" ":
                    idx+=1
                item[4]=item[4][idx:]
            """
            f_out.write(utils.mergeToLine(item).encode("utf-8"))
    f_out.close()
    return
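Example #2 relies on a size-capped Heap with insert, enum and size; its constructor and the cmp comparator are defined elsewhere in the project. A minimal sketch of such a structure, assuming a key function in place of the comparator (for instance the score appended in Example #1):

import heapq

class BoundedHeap:
    # Hypothetical stand-in for the project's Heap: keeps at most max_size
    # items, discarding the lowest-ranked ones according to key(item).
    def __init__(self, max_size, key):
        self.max_size = max_size
        self.key = key
        self._heap = []   # entries are (key(item), seq, item)
        self._seq = 0     # tie-breaker so items themselves are never compared

    @property
    def size(self):
        return len(self._heap)

    def insert(self, item):
        entry = (self.key(item), self._seq, item)
        self._seq += 1
        if len(self._heap) < self.max_size:
            heapq.heappush(self._heap, entry)
        else:
            # push the new entry and drop the current minimum in one step
            heapq.heappushpop(self._heap, entry)

    def enum(self):
        for _, _, item in self._heap:
            yield item

Under this sketch, Heap(max_size_percategory, cmp) in the example would correspond roughly to BoundedHeap(max_size_percategory, key=lambda item: item[-1]).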
Example #3
def main(infile, outfile, title_idx=3):
    trie = Trie()
    repeat_count = 0
    total = 0
    f_out = open(outfile, "wb")
    s = set()
    for item in utils.filesLineIter(infile):
        total += 1
        title = item[title_idx].strip().lower()
        if title not in s:
            s.add(title)
            f_out.write(utils.mergeToLine(item).encode("utf-8"))
        else:
            repeat_count += 1
    print("[*]repeat News :%s total:%s" % (repeat_count, total))
    return
Example #4
def run(infile, jsonfile, outfile, max_num):
    fin = open("Dataset/" + infile, "rb")
    fout = open("Dataset/" + outfile, "wb")
    keyword = KeyWord(jsonfile)
    cnt = {}
    total = 0  # total number of records written
    for item in utils.fileLineIter(fin):
        url = item[0]
        category = item[1]
        subcategory = item[2]
        title = item[3]
        content = item[4]
        # only categories defined in the json file are considered
        if not keyword.isCategoryExist(category):
            continue

        join = keyword.getJoinType(category)
        urlCategory = keyword.getUrlCategory(url)
        # keep the item if the url-derived category matches the labelled one,
        # or (when UsePriority is set) if it outranks it
        remainCurItem = False
        if urlCategory is not None:
            if urlCategory == category or (
                    UsePriority and
                    keyword.priorityCompare(urlCategory, category) > 0):
                remainCurItem = True
        elif join == "outer":
            remainCurItem = True
            urlCategory = category

        if remainCurItem:
            if urlCategory not in cnt:
                cnt[urlCategory] = 0
            if cnt[urlCategory] < max_num:
                if item[1] != urlCategory:
                    item[1] = urlCategory
                    item[2] = ""
                fout.write(utils.mergeToLine(item).encode("utf-8"))
                cnt[urlCategory] += 1
                total += 1
    fin.close()
    fout.close()
    print("cnt=%s" % (cnt))
    print("total=%s" % (total))
Example #5
def main(prefix_files, news_files_in, news_file_out):
    if news_file_out[:8] != "Dataset/":
        news_file_out = "Dataset/" + news_file_out
    news_out = open(news_file_out, "wb")
    # (1) black/white list
    bwList = BWList()
    delCnt = {}
    print("[*] loaded black/white list")
    # (2) prefix tree
    trie = buildTrie(prefix_files)
    print("[*] loaded prefix files")
    # run
    cnt = 0
    for fields in utils.fileLineIter(news_files_in):
        url = fields[0]
        category = fields[1]
        subcat = fields[2]
        title = fields[3]
        content = fields[4]
        if category == "rt_Unclassified":
            continue
        # base score from the url prefix tree; whitelist membership doubles it
        score = trie.getScore(url, category)
        if bwList.isInWhiteList(url, category):
            score *= 2
        #elif bwList.isInAllKeyWord(url):
        #    score /= 2
        # penalize items whose content is shorter than 600 characters
        c_len = len(content)
        if c_len < 600:
            score *= c_len / 600
        # tiny random jitter so items with equal scores are ordered randomly
        score += random.randint(10000, 99999) / 1000000000
        fields.append(score)
        #fields.append(random.randint(0, 10000000))
        news_out.write(utils.mergeToLine(fields).encode("utf-8"))
    news_out.close()
    return
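A small worked illustration of the scoring logic in Example #5, with made-up numbers: a whitelisted article of 300 characters whose prefix-tree score is 40.

import random

score = 40
score *= 2                     # whitelisted url: 40 -> 80
c_len = 300
if c_len < 600:
    score *= c_len / 600       # short-content penalty: 80 * 0.5 = 40.0
score += random.randint(10000, 99999) / 1000000000  # jitter to break ties
print(score)                   # roughly 40.00001 .. 40.0001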
Example #6
from Trie import Trie
import utils
import sys
from BWList import BWList
import time
import random
if __name__ == "__main__":
    files_in = ["Dataset/raw/WithoutQueryJoin_en_2017-08-29_2018-08-28_17_17.txt",
                "Dataset/raw/WithoutQueryJoin_en_2017-08-29_2018-08-28_14_14.txt"]
    file_out = "Dataset/raw/AllData"
    # concatenate the raw input files into a single dataset file
    fout = open(file_out, "wb")
    for fields in utils.filesLineIter(files_in):
        fout.write(utils.mergeToLine(fields).encode("utf-8"))
    fout.close()