Exemplo n.º 1
0
 def do_load(self, line):
     ''' Loads all texts in a directory.'''
     # NOTE(review): the do_* name and the unused ``line`` argument look like
     # a cmd-style command handler, in which case the docstring above may be
     # shown as help text, so it is kept verbatim -- TODO confirm.
     log('Loading texts and jobs. Might take a minute.')
     # Empty both lists in place (the list objects themselves are kept),
     # texts first and jobs second.
     self.texts[:] = []
     self.jobs[:] = []
     # Repopulate: jobs are loaded before texts.
     self.loadAllJobs()
     self.loadAllTexts()
Exemplo n.º 2
0
 def run_profile(self, focal, comparison, stopwords, delimiters, maxcost):
     """Generate one profile on every text in self.texts.

     focal, comparison, stopwords and delimiters are class ids; they are
     joined into the log message (so they must be strings) and resolved
     through self.getClass.  maxcost is handed to generateProfile untouched.
     """
     label = '_'.join((focal, comparison, stopwords, delimiters))
     log('Running profile: ' + label)
     for text in self.texts:
         # The ids are resolved again for every text, in the order
         # focal, comparison, stopwords, delimiters.
         focal_class = self.getClass(focal)
         comparison_class = self.getClass(comparison)
         stopword_class = self.getClass(stopwords)
         delimiter_class = self.getClass(delimiters)
         text.generateProfile(focal_class, comparison_class, stopword_class,
                              delimiter_class, maxcost)
Exemplo n.º 3
0
 def nodify(self):
     """Scan the file at self.path and fill self.nodes and self.charnum.

     The file is decoded as UTF-8 and consumed one character at a time.
     A character found in the parse handler's ignore set flushes the node
     handler's queue and is not counted.  Any other character advances the
     position counter and is added to the node handler once for every
     occurrence of it inside any character set of any class.

     On return self.nodes is a copy of the handler's node list and
     self.charnum is the number of non-ignored characters that were read.
     The file is closed and the garbage collector re-enabled even when
     reading or node creation raises.
     """
     ignores = self.parsehandler.ignore.chars
     classes = self.parsehandler.classes
     # The with block guarantees the handle is closed on every exit path.
     with codecs.open(self.path, encoding='utf-8') as source:
         log("Generating nodes for " + self.id)
         # Position in file, counting only characters that are not ignored
         pos = 0
         node_handler = NodeHandler(self.parsehandler)
         # Disable the garbage collector while looping; the finally clause
         # makes sure it is never left switched off by an exception.
         gc.disable()
         try:
             while True:
                 # We read the entire file one character at a time
                 current = source.read(1)
                 # An empty read means we've hit the end of the file
                 if not current:
                     break
                 # Ignored characters flush the queue and do not advance
                 # the position counter
                 if current in ignores:
                     node_handler.clearQueue()
                     continue
                 pos += 1
                 for cc in classes:
                     for char_set in cc.chars:
                         for char in char_set:
                             if current == char:
                                 node_handler.add(
                                     Node(current, cc, pos, char_set))
             node_handler.clearQueue()
             self.nodes = node_handler.nodes[:]
         finally:
             gc.enable()
     self.charnum = pos
     log("\tThere were " + str(self.charnum) +
           " characters, and " + str(len(self.nodes)) + " nodes.")
Exemplo n.º 4
0
 def do_load(self, line):
     ''' Loads all texts in a directory.'''
     # NOTE(review): presumably a cmd-style command (do_* name, unused
     # ``line`` argument); the docstring is left untouched because it may be
     # displayed as help text -- TODO confirm.
     log('Loading texts and jobs. Might take a minute.')
     # Clear the existing lists in place before reloading: texts, then jobs.
     self.texts[:] = []
     self.jobs[:] = []
     # Jobs are reloaded first, texts afterwards.
     self.loadAllJobs()
     self.loadAllTexts()
Exemplo n.º 5
0
 def loadAllTexts(self):
     """Load every non-hidden entry of self.dirpath, then nodify all texts.

     Entries whose name starts with '.' are skipped.  Each remaining entry
     is logged and passed to self.loadText as ``dirpath + '/' + name``.
     Afterwards nodify() is called on every item of self.texts.
     """
     for entry in os.listdir(self.dirpath):
         # Skip dot-files
         if entry[0] == '.':
             continue
         log('Loading ' + entry + '.')
         self.loadText('/'.join((self.dirpath, entry)))
     for loaded in self.texts:
         loaded.nodify()
Exemplo n.º 6
0
 def nodify(self):
     """Read self.path character by character and build this text's nodes.

     The file is opened as UTF-8.  Characters contained in the parse
     handler's ignore set clear the node handler's queue and are skipped
     without being counted.  Every other character increments the position
     counter and yields one Node per occurrence of that character in any
     character set of any class known to the parse handler.

     Sets self.nodes (a copy of the handler's node list) and self.charnum
     (the count of non-ignored characters).  The file handle is closed and
     garbage collection is switched back on even if an exception occurs
     while scanning.
     """
     ignores = self.parsehandler.ignore.chars
     classes = self.parsehandler.classes
     # Context manager: the handle is released on every exit path.
     with codecs.open(self.path, encoding='utf-8') as reader:
         log("Generating nodes for " + self.id)
         # Position in file (ignored characters are not counted)
         pos = 0
         node_handler = NodeHandler(self.parsehandler)
         # Disable garbage collector while looping; try/finally guarantees
         # it is re-enabled when something in the loop raises.
         gc.disable()
         try:
             while True:
                 # We read the entire file one character at a time
                 current = reader.read(1)
                 # If not current, we've hit the end of the file
                 if not current:
                     break
                 # Only continue (i.e., increment the count, etc.)
                 # if we aren't ignoring this character
                 if current in ignores:
                     node_handler.clearQueue()
                     continue
                 pos += 1
                 for cc in classes:
                     for char_group in cc.chars:
                         for char in char_group:
                             if current == char:
                                 node_handler.add(
                                     Node(current, cc, pos, char_group))
             node_handler.clearQueue()
             self.nodes = node_handler.nodes[:]
         finally:
             gc.enable()
     self.charnum = pos
     log("\tThere were " + str(self.charnum) + " characters, and " +
         str(len(self.nodes)) + " nodes.")
Exemplo n.º 7
0
 def loadAllTexts(self):
     """Load each visible directory entry as a text and nodify all texts.

     Names returned by os.listdir(self.dirpath) that begin with '.' are
     ignored; every other name is logged and handed to self.loadText joined
     to the directory path with a '/'.  Once the directory has been walked,
     nodify() runs on every element of self.texts.
     """
     for name in os.listdir(self.dirpath):
         if name[0] == '.':
             # Hidden entry: nothing to load
             continue
         log('Loading ' + name + '.')
         target = self.dirpath + '/' + name
         self.loadText(target)
     for text in self.texts:
         text.nodify()
Exemplo n.º 8
0
 def do_jobs(self, line):
     '''Quick command for do all in job_batch function.'''
     # NOTE(review): looks like a cmd-style command (do_* name, unused
     # ``line``); the docstring is kept verbatim since it may be displayed
     # as help text -- TODO confirm.
     # Reload the job list from scratch, emptying the existing list in place.
     self.jobs[:] = []
     self.loadAllJobs()
     for job in self.jobs:
         # The first five fields of a job are, in order, the arguments of
         # run_profile: focal, comparison, stopwords, delimiters, maxcost.
         focal, comparison, stopwords, delimiters, maxcost = (
             job[0], job[1], job[2], job[3], job[4])
         self.run_profile(focal, comparison, stopwords, delimiters, maxcost)
     log('Done running job batch.')
Exemplo n.º 9
0
 def run_profile(self, focal, comparison, stopwords, delimiters, maxcost):
     """Run generateProfile on every loaded text for one set of class ids.

     The four id arguments are concatenated (underscore separated) into the
     log line and looked up with self.getClass; maxcost is forwarded as-is.
     """
     ids = (focal, comparison, stopwords, delimiters)
     log('Running profile: ' + '_'.join(ids))
     for text in self.texts:
         # Lookups are repeated per text, in argument order.
         focal_cc = self.getClass(focal)
         comparison_cc = self.getClass(comparison)
         stopword_cc = self.getClass(stopwords)
         delimiter_cc = self.getClass(delimiters)
         text.generateProfile(focal_cc, comparison_cc, stopword_cc,
                              delimiter_cc, maxcost)
Exemplo n.º 10
0
 def getColocations(self, abscost):
     """Return focal nodes having a compare-class edge within abscost.

     A focal node is included once for every one of its edges whose ``cc``
     equals self.compare.id and whose absolute cost is at most abscost, so
     the same node may appear several times in the returned list.
     """
     compare_id = self.compare.id
     log("Compare class was : " + compare_id)
     return [
         focal
         for focal in self.focals[:]  # iterate over a snapshot of the list
         for edge in focal.edges
         if edge.cc == compare_id and abs(edge.cost) <= abscost
     ]
Exemplo n.º 11
0
 def getColocations(self, abscost):
     """Collect the focal nodes linked to the compare class within abscost.

     Every edge of every focal node is inspected; the focal node is appended
     each time an edge's ``cc`` matches self.compare.id and the edge's
     absolute cost does not exceed abscost.  Duplicates are therefore
     possible when a node has several matching edges.
     """
     wanted = self.compare.id
     log("Compare class was : " + wanted)
     matches = []
     # Work on a copy of the focal list, as the original code does.
     for node in list(self.focals[:]):
         hits = [edge for edge in node.edges
                 if edge.cc == wanted and abs(edge.cost) <= abscost]
         # One entry per matching edge
         matches.extend(node for _ in hits)
     return matches
Exemplo n.º 12
0
 def printProfile(self):
     """Log a header, the focal and compare class ids, then each focal node.

     Every focal node is preceded by a log entry containing a single
     newline and is written through its own printNode().
     """
     log("\n#### Profile ####")
     log("".join(("Focals: ", self.focal.id)))
     log("".join(("Compares: ", self.compare.id)))
     for node in self.focals:
         # Separator entry between nodes
         log("\n")
         node.printNode()
Exemplo n.º 13
0
 def printProfile(self):
     """Write this profile to the log.

     Emits the profile header, the ids of the focal and compare classes,
     and then, for every node in self.focals, a newline entry followed by
     that node's printNode() output.
     """
     log("\n#### Profile ####")
     focal_line = "Focals: " + self.focal.id
     log(focal_line)
     compare_line = "Compares: " + self.compare.id
     log(compare_line)
     for focal_node in self.focals:
         log("\n")
         focal_node.printNode()
Exemplo n.º 14
0
 def countInSentence(self):
     """Count compare-class edges lying between a focal node and its bounds.

     For every focal node, getClosestTwoDelimiterPositions supplies a pair
     (left, right) -- presumably the nearest delimiter positions on either
     side of the focal; verify against that helper.  An edge is counted when
     its ``cc`` equals self.compare.id and its position is strictly between
     left and the focal position, or strictly between the focal position and
     right.  Returns the total over all focal nodes.
     """
     log("Started counting in sentence for " + self.id)
     total = 0
     for focal in self.focals:
         focal_edges = focal.edges
         focal_pos = focal.pos
         bounds = self.getClosestTwoDelimiterPositions(focal_pos,
                                                       focal_edges)
         left, right = bounds[0], bounds[1]
         for edge in focal_edges:
             edge_pos = edge.pos
             if edge.cc != self.compare.id:
                 continue
             # Left of the focal but after the left bound, or right of the
             # focal but before the right bound.
             if (left < edge_pos < focal_pos
                     or right > edge_pos > focal_pos):
                 total += 1
     return total
Exemplo n.º 15
0
 def countInSentence(self):
     """Return how many compare-class edges fall inside each focal's bounds.

     The two bounds for a focal node come from
     getClosestTwoDelimiterPositions(focal position, focal edges); they are
     presumably the closest delimiters to the left and right -- TODO confirm.
     An edge contributes 1 when its ``cc`` is self.compare.id and its
     position lies strictly between the left bound and the focal, or
     strictly between the focal and the right bound.
     """
     log("Started counting in sentence for " + self.id)
     matches = 0
     for node in self.focals:
         node_edges = node.edges
         node_pos = node.pos
         closest = self.getClosestTwoDelimiterPositions(node_pos, node_edges)
         lower = closest[0]
         upper = closest[1]
         for edge in node_edges:
             where = edge.pos
             if edge.cc == self.compare.id:
                 before_focal = lower < where < node_pos
                 if before_focal or upper > where > node_pos:
                     matches += 1
     return matches
Exemplo n.º 16
0
 def printProfile(self):
     """Report the profile header and its two count tables.

     The header and the focal / compare class ids go through log(); the
     entries of self.edge_count and self.contingency are written with
     print(), one line per key.
     """
     log("\n#### Profile ####")
     log("".join(("Focals: ", self.focal.id)))
     log("".join(("Compares: ", self.compare.id)))
     for key, value in self.edge_count.items():
         print(''.join(('Edge count (', str(key), '): ', str(value))))
     for key, value in self.contingency.items():
         print(''.join(('Contingency table (', str(key), '): ',
                        str(value))))
Exemplo n.º 17
0
 def generateProfile(self, focal, compare, stopword, delim, maxcost=120):
     """Build a NodeProfile from copies of this text's nodes and store it.

     Every node in self.nodes is duplicated as a new Node (char, cc, pos,
     key).  The number of copies is logged.  Each copy is then placed in the
     first bucket whose class equals its ``cc``, tested in the order focal,
     stopword, delim, compare; copies matching none of them are dropped.
     The resulting NodeProfile is appended to self.profiles.

     maxcost defaults to 120 and is passed on to NodeProfile unchanged.
     """
     copies = [Node(src.char, src.cc, src.pos, src.key)
               for src in self.nodes]
     log(len(copies))
     focals, stopwords, delims, compares = [], [], [], []
     # (class, bucket) pairs in priority order: the first match wins.
     buckets = (
         (focal, focals),
         (stopword, stopwords),
         (delim, delims),
         (compare, compares),
     )
     for node in copies:
         for wanted, bucket in buckets:
             if node.cc == wanted:
                 bucket.append(node)
                 break
     profile = NodeProfile(focals, stopwords, delims, compares, focal,
                           compare, stopword, delim, maxcost)
     self.profiles.append(profile)
Exemplo n.º 18
0
 def printEdge(self):
     """Log this edge: header, class, id, cost and absolute cost.

     Each line is indented with two tabs and the last one ends with an
     extra newline.  self.cc and self.id are concatenated directly, so both
     have to be strings already; the cost is converted with str().
     """
     log("\t\t#### Edge ####")
     log("".join(("\t\tClass: ", self.cc)))
     log("".join(("\t\tId: ", self.id)))
     log("".join(("\t\tCost: ", str(self.cost))))
     magnitude = abs(self.cost)
     log("".join(("\t\tAbsolute cost: ", str(magnitude), "\n")))
Exemplo n.º 19
0
 def printNode(self):
     """Log this node's class id and key, followed by each of its edges.

     Lines are indented with one tab; edges are written by their own
     printEdge().
     """
     log("\t#### Node ####")
     log("".join(("\tClass: ", self.cc.id)))
     log("".join(("\tKey: ", self.key)))
     for edge in self.edges:
         edge.printEdge()
Exemplo n.º 20
0
 def getClass(self, id):
     """Return the first entry of self.classes whose ``id`` equals *id*.

     When nothing matches, a message is logged and None is returned.
     """
     found = next((cls for cls in self.classes if cls.id == id), None)
     if found is not None:
         return found
     log('No class found for ' + id)
     return None
Exemplo n.º 21
0
    def generateEdges(self):
        """Attach Edge objects to every focal node for nearby nodes.

        For each node in self.focals three passes are made, in this order:

        1. Stopwords: an edge is added for every stopword whose raw cost
           (focal pos - stopword pos) is non-zero and at most self.maxcost
           in absolute value.
        2. Delimiters: the raw cost is first moved towards zero by one for
           every stopword edge already on this focal that lies strictly
           between the delimiter and the focal; the adjusted cost is then
           tested the same way.
        3. Compares: like delimiters, except that both stopword and
           delimiter edges lying in between reduce the cost.

        Each pass starts at the list index where an earlier focal most
        recently found its first in-range element, and stops at the first
        element whose raw cost is below -maxcost.
        NOTE(review): the early break and the moving start index presumably
        rely on focals, stopwords, delims and compares being sorted by
        ascending pos -- confirm against the code that builds them.

        Nothing is returned; the total number of edges added is logged.
        """
        log("Generating edges for " + self.id)
        # NOTE: this local shadows the builtin max() inside the method.
        max = self.maxcost
        neg_max = (-1 * max)
        edge_count = 0
        # The list position of the first found element so we don't need to
        # keep checking the beginning of the list when abs(cost) > maxcost
        first_stop = 0
        first_delim = 0
        first_compare = 0
        # Optimization trick: read the class ids once, outside the loops
        stopword = self.stopword.id
        delim = self.delim.id
        for f in self.focals:
            # The collector is switched off for the duration of one focal's
            # passes; there is no try/finally, so it stays off if an
            # exception escapes the loop body.
            gc.disable()
            f_pos = f.pos
            # For each newly-minted focal node, determine the distance to
            # each stopword if the node is within maxcost absolute distance.
            # This is because we don't want to count these words towards the
            # distance of future nodes.
            found_first_stop = False
            # stop_index tracks the absolute index into self.stopwords of
            # the element currently being examined.
            stop_index = first_stop
            for s in self.stopwords[first_stop:]:
                s_cost = f_pos - s.pos
                # Stop at upper searching bound
                if s_cost < neg_max:
                    break
                # Double-checking constraints, might not be necessary
                if abs(s_cost) <= max and s_cost != 0:
                    # Set first matched stop character to
                    # lower bound for searching
                    if found_first_stop == False:
                        found_first_stop = True
                        first_stop = (stop_index)
                    f.add(Edge(s, s_cost))
                    edge_count += 1
                stop_index += 1

            # Do the same for the delimiters.
            found_first_delim = False
            delim_index = first_delim
            for d in self.delims[first_delim:]:
                d_pos = d.pos
                d_cost = f_pos - d_pos
                d_takeaway = 0
                # The bound is tested on the raw cost, before any takeaway
                if d_cost < neg_max:
                    break
                for e in f.edges:
                    e_pos = e.pos
                    # If this edge is between the delimiter and the focal
                    # character, and it's a stopword, we'll need to account
                    # for the position difference since stopwords are (sometimes)
                    # to be considered the same as whitespace.
                    if (((e_pos > d_pos and e_pos < f_pos) or
                         (e_pos < d_pos and e_pos > f_pos))
                            and (e.cc == stopword)):
                        d_takeaway += 1
                # Decrease the absolute cost
                if d_cost < 0:
                    d_cost += d_takeaway
                elif d_cost > 0:
                    d_cost -= d_takeaway
                # The range test uses the adjusted cost
                if abs(d_cost) <= max and d_cost != 0:
                    if found_first_delim == False:
                        found_first_delim = True
                        first_delim = (delim_index)
                    f.add(Edge(d, d_cost))
                    edge_count += 1
                delim_index += 1

            # Now we can calculate the compares by the distance ignoring
            # stopwords and delimiters, giving a better true distance
            found_first_compare = False
            compare_index = first_compare
            for c in self.compares[first_compare:]:
                c_pos = c.pos
                c_cost = f_pos - c_pos
                if c_cost < neg_max:
                    break
                takeaway = 0
                for e in f.edges:
                    e_pos = e.pos
                    # Similarly to above, both stopwords and delimiters
                    # are considered to not count towards edge costs for
                    # comparison characters.
                    if (((e_pos > c_pos and e_pos < f_pos) or
                         (e_pos < c_pos and e_pos > f_pos))
                            and (e.cc == stopword or e.cc == delim)):
                        takeaway += 1
                # Move the cost towards zero by the number of skipped edges
                if c_cost < 0:
                    c_cost += takeaway
                elif c_cost > 0:
                    c_cost -= takeaway
                if abs(c_cost) <= max and c_cost != 0:
                    if found_first_compare == False:
                        found_first_compare = True
                        first_compare = (compare_index)
                    f.add(Edge(c, c_cost))
                    edge_count += 1
                compare_index += 1
            gc.enable()
        log("Edge count was " + str(edge_count))
Exemplo n.º 22
0
 def printEdge(self):
     """Write this edge to the log, one attribute per line.

     Output is a header followed by class, id, signed cost and absolute
     cost, every line prefixed by two tabs; the final line carries a
     trailing newline.  self.cc and self.id are used as strings without
     conversion.
     """
     log("\t\t#### Edge ####")
     class_line = "\t\tClass: " + self.cc
     log(class_line)
     id_line = "\t\tId: " + self.id
     log(id_line)
     cost_line = "\t\tCost: " + str(self.cost)
     log(cost_line)
     absolute = abs(self.cost)
     log("\t\tAbsolute cost: " + str(absolute) + "\n")
Exemplo n.º 23
0
 def getClass(self, id):
     """Look up a class by id in self.classes.

     Returns the first element whose ``id`` attribute equals *id*.  If no
     element matches, logs a message and returns None.
     """
     matches = (candidate for candidate in self.classes
                if candidate.id == id)
     hit = next(matches, None)
     if hit is None:
         log('No class found for ' + id)
         return None
     return hit
Exemplo n.º 24
0
 def printNode(self):
     """Log a header, this node's class id and its key (tab-indented).

     Unlike the edge-aware variant, no edges are written here.
     self.cc.id and self.key are concatenated as strings.
     """
     log("\t#### Node ####")
     log("".join(("\tClass: ", self.cc.id)))
     log("".join(("\tKey: ", self.key)))
Exemplo n.º 25
0
 def printNode(self):
     """Write this node and all of its edges to the log.

     Logs a header, the id of the node's class and the node's key, each
     indented by one tab, then calls printEdge() on every edge in order.
     """
     log("\t#### Node ####")
     class_line = "\tClass: " + self.cc.id
     log(class_line)
     key_line = "\tKey: " + self.key
     log(key_line)
     for edge in self.edges:
         edge.printEdge()
Exemplo n.º 26
0
 def do_jobs(self, line):
     '''Quick command for do all in job_batch function.'''
     # NOTE(review): presumably a cmd-style command (do_* name, unused
     # ``line``); docstring kept verbatim as it may double as help text.
     # Runs every job already present in self.jobs; the list is not
     # reloaded here.
     for job in self.jobs:
         # Job fields 0..4 map to focal, comparison, stopwords, delimiters
         # and maxcost of run_profile.
         focal, comparison, stopwords, delimiters, maxcost = (
             job[0], job[1], job[2], job[3], job[4])
         self.run_profile(focal, comparison, stopwords, delimiters, maxcost)
     log('Done running job batch.')
Exemplo n.º 27
0
 def generateEdges(self):
     """Add edges from each focal node to nearby stopwords, delimiters
     and compare nodes.

     Per focal node, in this order:

     * every stopword whose raw cost (focal pos - stopword pos) is non-zero
       and within +/- self.maxcost gets an edge;
     * every delimiter gets an edge when its cost, after being moved towards
       zero by the number of this focal's stopword edges lying strictly
       between the two positions, is non-zero and within +/- maxcost;
     * every compare node is treated like a delimiter, but both stopword
       and delimiter edges in between reduce its cost.

     Scanning of each list begins at the index where an earlier focal most
     recently found its first in-range element and ends at the first
     element whose raw cost is below -maxcost.
     NOTE(review): this presumably requires all four node lists to be
     ordered by ascending pos -- verify where they are built.

     Returns nothing; the number of edges created is logged at the end.
     """
     log("Generating edges for " + self.id)
     # NOTE: shadows the builtin max() within this method.
     max = self.maxcost
     neg_max = (-1 * max)
     edge_count = 0
     # The list position of the first found element so we don't need to
     # keep checking the beginning of the list when abs(cost) > maxcost
     first_stop = 0
     first_delim = 0
     first_compare = 0
     # Optimization trick: look the class ids up once, outside the loops
     stopword = self.stopword.id
     delim = self.delim.id
     for f in self.focals:
         # Garbage collection is off while one focal is processed; without
         # a try/finally it remains off if the body raises.
         gc.disable()
         f_pos = f.pos
         # For each newly-minted focal node, determine the distance to
         # each stopword if the node is within maxcost absolute distance.
         # This is because we don't want to count these words towards the
         # distance of future nodes.
         found_first_stop = False
         # Absolute index into self.stopwords of the element being examined
         stop_index = first_stop
         for s in self.stopwords[first_stop:]:
             s_cost = f_pos - s.pos
             # Stop at upper searching bound
             if s_cost < neg_max:
                 break
             # Double-checking constraints, might not be necessary
             if abs(s_cost) <= max and s_cost != 0:
                 # Set first matched stop character to
                 # lower bound for searching
                 if found_first_stop == False:
                     found_first_stop = True
                     first_stop = (stop_index)
                 f.add(Edge(s,s_cost))
                 edge_count += 1
             stop_index += 1

         # Do the same for the delimiters.
         found_first_delim = False
         delim_index = first_delim
         for d in self.delims[first_delim:]:
             d_pos = d.pos
             d_cost = f_pos - d_pos
             d_takeaway = 0
             # The bound is checked on the raw cost, before any takeaway
             if d_cost < neg_max:
                 break
             for e in f.edges:
                 e_pos = e.pos
                 # If this edge is between the delimiter and the focal
                 # character, and it's a stopword, we'll need to account
                 # for the position difference since stopwords are (sometimes)
                 # to be considered the same as whitespace.
                 if (((e_pos > d_pos and e_pos < f_pos) or
                         (e_pos < d_pos and e_pos > f_pos)) and
                             (e.cc == stopword)):
                     d_takeaway += 1
             # Decrease the absolute cost
             if d_cost < 0:
                 d_cost += d_takeaway
             elif d_cost > 0:
                 d_cost -= d_takeaway
             # The range test uses the adjusted cost
             if abs(d_cost) <= max and d_cost != 0:
                 if found_first_delim == False:
                     found_first_delim = True
                     first_delim = (delim_index)
                 f.add(Edge(d,d_cost))
                 edge_count += 1
             delim_index += 1

         # Now we can calculate the compares by the distance ignoring
         # stopwords and delimiters, giving a better true distance
         found_first_compare = False
         compare_index = first_compare
         for c in self.compares[first_compare:]:
             c_pos = c.pos
             c_cost = f_pos - c_pos
             if c_cost < neg_max:
                 break
             takeaway = 0
             for e in f.edges:
                 e_pos = e.pos
                 # Similarly to above, both stopwords and delimiters
                 # are considered to not count towards edge costs for
                 # comparison characters.
                 if (((e_pos > c_pos and e_pos < f_pos) or
                      (e_pos < c_pos and e_pos > f_pos)) and
                         (e.cc == stopword or
                          e.cc == delim)):
                     takeaway += 1
             # Move the cost towards zero by the number of skipped edges
             if c_cost < 0:
                 c_cost += takeaway
             elif c_cost > 0:
                 c_cost -= takeaway
             if abs(c_cost) <= max and c_cost != 0:
                 if found_first_compare == False:
                     found_first_compare = True
                     first_compare = (compare_index)
                 f.add(Edge(c,c_cost))
                 edge_count += 1
             compare_index += 1
         gc.enable()
     log("Edge count was " + str(edge_count))
Exemplo n.º 28
0
 def countAllInSentence(self):
     """Return the sum of countInSentence(f) over every node in self.focals.

     A start message containing self.id is logged before counting.
     """
     log("Started counting in sentence for " + self.id)
     return sum(self.countInSentence(focal) for focal in self.focals)