Пример #1
0
 def __init__(self, files, silent=False):
     """Store the two file objects and load the id dictionary.

     files  -- indexable; items 0 and 1 are kept as self.f0 and self.f1
     silent -- flag kept as self.silent

     The id dictionary is obtained from Map_ID_Loader, which is handed the
     opened "data_id_post.csv" file (path is relative to the current
     working directory).
     """
     self.f0 = files[0]
     self.f1 = files[1]
     self.silent = silent
     # 'with' guarantees the csv handle is closed even if the loader raises.
     # getDict() is called while the file is still open, as before.
     with open("data_id_post.csv", "r") as f:
         self.id_loader = Map_ID_Loader(f, True)
         self.id_dict = self.id_loader.getDict()
Пример #2
0
class CityGraph_header(object):
    """Count <NE:CITY> mentions in the <cname> and <head> parts of each <doc>.

    The input (files[0]) is read in one go with .read(); a small report per
    document is written to the output (files[1]) with .write().
    """

    def __init__(self, files, silent=False):
        """Store the two file objects and load the id dictionary.

        files  -- indexable; items 0 and 1 are kept as self.f0 (input)
                  and self.f1 (output)
        silent -- flag kept as self.silent (not consulted by this class)
        """
        self.f0 = files[0]
        self.f1 = files[1]
        self.silent = silent
        # 'with' guarantees the csv handle is closed even if the loader raises.
        with open("data_id_post.csv", "r") as f:
            self.id_loader = Map_ID_Loader(f, True)
            self.id_dict = self.id_loader.getDict()

    @staticmethod
    def _first_match(pattern, text):
        # Return the first full match of pattern in text, or "" when absent.
        found = pattern.search(text)
        if found:
            return found.group(0)
        return ""

    @staticmethod
    def _city_name(tag):
        # Text between the first '>' and the second '<' of a city tag,
        # i.e. the content of <NE:CITY ...>content</NE:CITY>.
        return tag[tag.find('>')+1:tag.find('<', tag.find('<')+1)]

    def process(self):
        """Write one report per <doc>...</doc> found in the input.

        The client city is the first city tagged inside <cname>; when there
        is none, the most frequent city of <head> is used instead.  A
        document without a <cname> or <head> section is treated as if that
        section were empty rather than raising IndexError.
        """
        pdoc = re.compile(r'<doc>.*?</doc>', re.S)
        pcname = re.compile(r'<cname>.*?</cname>', re.S)
        phead = re.compile(r'<head>.*?</head>', re.S)
        pcity = re.compile(r'<NE:CITY.*?>.*?</NE:CITY>', re.S)

        # Documents are numbered from 1 in the report.
        for i, doc in enumerate(pdoc.findall(self.f0.read()), 1):
            strcname = self._first_match(pcname, doc)
            strhead = self._first_match(phead, doc)

            client = ""
            cname_cities = pcity.findall(strcname)
            if cname_cities:
                client = self._city_name(cname_cities[0])

            citycount = {}
            for tag in pcity.findall(strhead):
                name = self._city_name(tag)
                citycount[name] = citycount.get(name, 0) + 1

            if not client and citycount:
                client = self.find_most(citycount)
            self.output(client, citycount, i)

    def output(self, client, citycount, ith):
        """Write the report for document number ith to self.f1.

        An empty client is reported as "N/A"; every city of citycount is
        listed with its count.
        """
        self.f1.write("--------------------------------------\n")
        self.f1.write("header doc %5d\n" %(ith))
        if not client:
            client = "N/A"
        self.f1.write("client city is %20s\n" %(client))
        for k in citycount:
            self.f1.write("%20s %5s\n"  %(k, citycount[k]))

    def find_most(self, citycount):
        """Return the key of citycount with the largest count.

        On a tie the key met first while iterating wins.  Returns '' when
        citycount is empty or holds no count above zero.
        """
        best_count = 0
        result = ''
        for k in citycount:
            if citycount[k] > best_count:
                best_count = citycount[k]
                result = k
        return result
Пример #3
0
class Header_Matcher(object):
    """Copy documents of one file whose id also appears in another file.

    files[0] (self.f1) is the raw input, files[1] (self.f2) the file whose
    document ids are collected, files[2] (self.f3) the output.  Ids come
    from looking up each document's URL in self.id_dict.
    """

    def __init__(self, files, silent=False):
        """Store the three file objects and load the id dictionary.

        files  -- indexable; items 0, 1 and 2 are kept as self.f1, self.f2
                  and self.f3
        silent -- flag kept as self.silent (not consulted by this class)
        """
        self.f1 = files[0]
        self.f2 = files[1]
        self.f3 = files[2]
        self.silent = silent
        # 'with' guarantees the csv handle is closed even if the loader raises.
        with open("data_id_post.csv", "r") as f:
            self.id_loader = Map_ID_Loader(f, True)
            self.id_dict = self.id_loader.getDict()

    def process(self):
        """Collect ids from self.f2, then filter self.f1 into self.f3.

        Prints the number of collected ids and the number of documents
        written.
        """
        self.make_annotated_dict()
        # Single-argument print(...) gives the same output on Python 2 and 3.
        print(len(self.annotated_dict))
        self.go_through_header_raw()
        print(self.num)

    def go_through_header_raw(self):
        """Write to self.f3 every document of self.f1 with a collected id.

        self.num counts the documents written.  As in the original loop,
        reading stops at the first document whose extracted text is empty.
        """
        self.num = 0
        s, fid = self.extractDoc(self.f1)
        while s != "":
            # An empty fid means the URL was not in id_dict; such documents
            # have no identity and must never match.
            if fid != "" and fid in self.annotated_dict:
                self.f3.write(s)
                self.num += 1
            s, fid = self.extractDoc(self.f1)

    def make_annotated_dict(self):
        """Fill self.annotated_dict with the id of every document in self.f2.

        Documents whose URL is not in id_dict (empty fid) are skipped, so
        that unrelated unmapped documents are not lumped under one key.
        """
        self.annotated_dict = {}
        s, fid = self.extractDoc(self.f2)
        while s != "":
            if fid != "":
                self.annotated_dict[fid] = 1
            s, fid = self.extractDoc(self.f2)

    def extractDoc(self, f):
        """Read the next document from f and return (text, fid).

        Lines are skipped up to one containing "<doc>".  The following line
        is expected to start with the URL, e.g. "<http:url>:"; its first
        token, stripped of '<', '>' and the last character, is looked up in
        self.id_dict to get fid ("" when absent or when the line is blank).
        text is every line from the URL line up to, not including, the line
        containing "</doc>".  Returns ("", "") at end of file.  A document
        cut short by end of file is returned as far as it goes instead of
        looping forever.
        """
        c = f.readline()
        while c and "<doc>" not in c:
            c = f.readline()
        if not c:
            return "", ""

        c = f.readline()
        fid = ""
        tokens = c.replace('<', '').replace('>', '').split()
        if tokens:
            tmp = tokens[0][:-1]           # <http:url\>:
            if tmp in self.id_dict:
                fid = self.id_dict[tmp]

        lines = []
        # 'c' becomes "" at end of file; stop there as well as at </doc>.
        while c and "</doc>" not in c:
            lines.append(c)
            c = f.readline()
        return "".join(lines), fid
Пример #4
0
class CityGraph(object):
    """Build per-document counts of <NE:CITY> mentions.

    Documents are read line by line from files[0] (self.f0); the counts,
    keyed by document id and then by unified city name, are kept in
    self.doc_dict and can be written to files[1] (self.f1).
    """

    def __init__(self, files, silent=False):
        """Store the two file objects and load the id dictionary.

        files  -- indexable; items 0 and 1 are kept as self.f0 (input)
                  and self.f1 (output)
        silent -- when true, the out_city_count_list* methods write nothing
        """
        self.f0 = files[0]
        self.f1 = files[1]
        self.silent = silent
        # Defined here too so extractDoc() can be called before init().
        self.docNum = 0
        self.doc_dict = {}
        # 'with' guarantees the csv handle is closed even if the loader raises.
        with open("data_id_post.csv", "r") as f:
            self.id_loader = Map_ID_Loader(f, True)
            self.id_dict = self.id_loader.getDict()

    def init(self):
        """Build the city count list on a per-document basis.

        Resets self.docNum and self.doc_dict, then reads every document of
        self.f0.  Documents whose URL is not in id_dict (empty fid) are
        skipped.  A fid met again starts from an empty count dict.  Reading
        stops at the first document whose extracted text is empty.
        """
        self.docNum = 0
        self.doc_dict = {}
        s, fid = self.extractDoc()
        while s != "":
            if fid != "":
                self.doc_dict[fid] = {}
                self.update_dict(s, fid)
            s, fid = self.extractDoc()

    def update_dict(self, docstr, fid):
        """Add the city mentions found in docstr to self.doc_dict[fid].

        self.doc_dict[fid] must already exist.  City names are passed
        through unify() before counting.  The pattern has no re.S flag, so
        a tag must open and close on the same line.
        """
        p = re.compile(r'<NE:CITY.*?>.*?</NE:CITY>')
        counts = self.doc_dict[fid]
        for items in p.findall(docstr):
            # Text between the first '>' and the second '<' of the tag.
            name = items[items.find('>')+1:items.find('<', items.find('<')+1)]
            name = self.unify(name)
            counts[name] = counts.get(name, 0) + 1

    def unify(self, city):
        """Return city lower-cased, without '.' and ',', with runs of
        whitespace collapsed to single spaces and the ends trimmed."""
        tmp = city.lower().replace(".", "").replace(",", "")
        return ' '.join(tmp.split())

    def out_city_count_list_comma(self):
        """Write "city,doc_id,count" lines to self.f1 (comma separated,
        no header line).  Does nothing when self.silent is true."""
        if self.silent:
            return
        for doc_id, counts in self.doc_dict.items():
            for each_city in counts:
                self.f1.write("%s,%s,%.0f\n" %(each_city, doc_id, counts[each_city]))

    def out_city_count_list(self):
        """Write a header line followed by fixed-width
        "city doc_id count" lines to self.f1.  Does nothing when
        self.silent is true."""
        if self.silent:
            return
        self.f1.write("%20s %20s %5s\n"  %("CITY NAME", "DOC", "COUNT"))
        for doc_id, counts in self.doc_dict.items():
            for each_city in counts:
                self.f1.write("%20s %20s %5.0f\n" %(each_city, doc_id, counts[each_city]))

    def extractDoc(self):
        """Read the next document from self.f0 and return (text, fid).

        Lines are skipped up to one containing "<doc>".  The first token of
        the following line, minus its last character (the URL is followed
        by an extra ':'), is looked up in self.id_dict to get fid ("" when
        absent or when the line is blank).  text is every line from the URL
        line up to, not including, the line containing "</doc>".  Returns
        ("", "") at end of file.  A document cut short by end of file is
        returned as far as it goes instead of looping forever.  Each
        document read increments self.docNum, which is printed every 1000
        documents.
        """
        c = self.f0.readline()
        while c and "<doc>" not in c:
            c = self.f0.readline()
        if not c:
            return "", ""

        c = self.f0.readline()
        fid = ""
        tokens = c.split()
        if tokens:
            url = tokens[0][:-1]
            if url in self.id_dict:
                fid = self.id_dict[url]

        lines = []
        # 'c' becomes "" at end of file; stop there as well as at </doc>.
        while c and "</doc>" not in c:
            lines.append(c)
            c = self.f0.readline()

        self.docNum += 1
        if self.docNum % 1000 == 0:
            # Single-argument print(...) gives the same output on Python 2 and 3.
            print(self.docNum)
        return "".join(lines), fid