Exemplos de preprocessText em Python, exemplos de naruhodo.utils.misc.preprocessText em Python

Exemplo n.º 1

0

Exibir arquivo

 def _wvResolve(self, proname, flatEntityList):
     """Pick the entity most similar to the neighbourhood of ``proname``.

     The preprocessed names of the successors of ``proname`` in ``self.G``,
     and of the successors of those successors, are gathered whenever they
     are present in ``self.wv``.  Every entry of ``flatEntityList`` whose
     preprocessed form is in ``self.wv`` and is not one of the gathered
     names is then scored with ``harmonicSim`` against the gathered vectors.

     Returns the highest-scoring entry (the original, non-preprocessed
     item) when its score exceeds 0.7, otherwise an empty string.
     """
     contextNames = []
     contextVecs = []

     def remember(node):
         # Keep the node only when the word-vector model knows its name.
         word = preprocessText(node)
         if word in self.wv:
             contextNames.append(word)
             contextVecs.append(self.wv[word])

     for succ in self.G.successors(proname):
         remember(succ)
         for succ2 in self.G.successors(succ):
             remember(succ2)

     # Nothing to compare against.
     if len(contextVecs) == 0:
         return ""

     bestScore = 0.
     bestItem = ""
     for candidate in flatEntityList:
         word = preprocessText(candidate)
         if word in contextNames or word not in self.wv:
             continue
         score = harmonicSim(contextVecs, self.wv[word])
         # Strict comparison: on ties the earliest candidate is kept.
         if bestScore < score:
             bestScore = score
             bestItem = candidate
     return bestItem if bestScore > 0.7 else ""

Exemplo n.º 2

0

Exibir arquivo

 def _addAllMP(self, inps):
     """Add all inputs through the worker pool and merge the results.

     Each input is preprocessed and paired with its position
     (``self.pos`` plus its index in ``inps``); for graph type ``"k"`` the
     ``self.autosub`` flag is appended as a third argument.  The argument
     lists are dispatched with ``self.pool.starmap``, the results are
     combined by ``self._reduce`` and merged into ``self.G``,
     ``self.entityList`` and ``self.proList``.

     Raises:
         ValueError: if ``self.lang`` is not ``"ja"`` or ``self.gtype`` is
             neither ``"d"`` nor ``"k"``.
     """
     if self.lang != "ja":
         raise ValueError("Unsupported language: {0}".format(self.lang))

     if self.gtype == "d":
         jobs = [[self.pos + offset, preprocessText(text)]
                 for offset, text in enumerate(inps)]
         worker = self._addMP_ja_d
     elif self.gtype == "k":
         jobs = [[self.pos + offset, preprocessText(text), self.autosub]
                 for offset, text in enumerate(inps)]
         worker = self._addMP_ja_k
     else:
         raise ValueError("Unknown graph type: {0}".format(self.gtype))

     results = self.pool.starmap(worker, jobs)
     self.pos += len(jobs)

     # _reduce yields (graph, entity list, pro list) at indices 0, 1 and 2.
     reduced = self._reduce(results)
     self.G = _mergeGraph(self.G, reduced[0])
     self.entityList = _mergeEntityList(self.entityList, reduced[1])
     self.proList = _mergeProList(self.proList, reduced[2])

Exemplo n.º 3

0

Exibir arquivo

 def resolveSynonym(self):
     """Link entities that look like synonyms of each other.

     The keys of ``self.entityList[1]`` and ``self.entityList[3]`` are
     flattened into one list.  Every pair is compared with ``inclusive``
     on the preprocessed names; when both names are known to ``self.wv``
     their cosine similarity must additionally exceed 0.5 (pairs that
     cannot be scored are treated as similar).  Matching pairs get a
     ``"synonym"`` edge in ``self.G`` whose direction follows the sign of
     ``inclusive``.  For each connected group of matches the shortest
     name is recorded in ``self.synonymDict`` and stored as the
     ``'synonym'`` attribute of every node in the group.

     Returns:
         The flattened entity list (original, non-preprocessed names).
     """
     # Undirected graph whose connected components are synonym groups.
     GS = nx.Graph()
     # Add person and organization to flatEntityList
     flatEntityList = list()
     for i in [1, 3]:
         flatEntityList.extend(self.entityList[i].keys())

     # Preprocess each entity once instead of once per compared pair.
     cleaned = [preprocessText(entity) for entity in flatEntityList]
     total = len(flatEntityList)

     # Probe the model only when at least one pair will be compared, and
     # warn a single time rather than once for every pair.
     wvMissing = total > 1 and not self.wv
     if wvMissing:
         print("Word vector model is not set correctly. Skipping part of coreference resolution.")

     # Find syntactic synonyms
     for i in range(total):
         A = cleaned[i]
         for j in range(i + 1, total):
             B = cleaned[j]
             inc = inclusive(A, B)
             if not wvMissing and A in self.wv and B in self.wv:
                 sim = cosSimilarity(self.wv[A], self.wv[B])
             else:
                 # No vectors available: let the inclusion test decide alone.
                 sim = 1.
             if inc == 1 and sim > 0.5:
                 GS.add_edge(flatEntityList[i], flatEntityList[j])
                 self.G.add_edge(flatEntityList[i], flatEntityList[j], weight=1, label="同義語候補", type="synonym")
             elif inc == -1 and sim > 0.5:
                 GS.add_edge(flatEntityList[i], flatEntityList[j])
                 self.G.add_edge(flatEntityList[j], flatEntityList[i], weight=1, label="同義語候補", type="synonym")

     # Process GS: the shortest name of each group becomes its representative.
     for subG in nx.connected_components(GS):
         # min() keeps the first shortest node met, with no length ceiling.
         nshort = min(subG, key=len)
         self.synonymDict.add(nshort)
         for node in subG:
             self.G.nodes[node]['synonym'] = nshort
     return flatEntityList

Exemplo n.º 4

0

Exibir arquivo

 def _processMeaningless(self):
     """Tag meaningless chunks with the content of their last child.

     For every chunk whose preprocessed ``main`` is in ``MeaninglessDict``
     and that has at least one entry in ``self.childrenList``, ``meaning``
     is set to the last child's ``main`` and the chunk's own ``main`` is
     prefixed with the last child's ``surface`` in parentheses followed by
     a newline.
     """
     for idx, chunk in enumerate(self.chunks):
         if preprocessText(chunk.main) not in MeaninglessDict:
             continue
         children = self.childrenList[idx]
         if len(children) == 0:
             continue
         lastChild = self.chunks[children[-1]]
         chunk.meaning = lastChild.main
         chunk.main = "({0})\n{1}".format(lastChild.surface, chunk.main)

Exemplo n.º 5

0

Exibir arquivo

 def _processNegative(self):
     """Mark the last child of each negation chunk as negative.

     A chunk counts as a negation when its preprocessed ``main`` equals
     "ない".  If it has children, the last child gets the "\\n(否定)" suffix
     appended to ``main`` and ``negative`` set to 1, and the negation
     chunk's ``meaning`` is set to that child's updated ``main``.  The
     suffix is then stripped from the negation chunk's own ``main``,
     whether or not it has children.
     """
     for idx, chunk in enumerate(self.chunks):
         if preprocessText(chunk.main) != "ない":
             continue
         children = self.childrenList[idx]
         if len(children) > 0:
             target = self.chunks[children[-1]]
             target.main += "\n(否定)"
             target.negative = 1
             chunk.meaning = target.main
         chunk.main = chunk.main.replace("\n(否定)", "")

Exemplo n.º 6

0

Exibir arquivo

 def add(self, inp):
     """Add one sentence to the graph.

     The input is preprocessed; an empty result is returned immediately
     without touching any state.  Otherwise the sentence is handed to
     ``self.core`` at the current position, the position is advanced, and
     the core's graph, entity list and pro list are merged into this
     object and then reset on the core.  Synonym and coreference
     resolution follow when ``self.synonym`` / ``self.coref`` are set.

     Returns:
         A one-element list holding the preprocessed input.
     """
     text = preprocessText(inp)
     if text == "":
         return [text]

     self.core.add(text, self.pos)
     self.pos += 1

     # Merge the core's per-sentence results, resetting each one afterwards.
     self.G = _mergeGraph(self.G, self.core.G)
     self.core.G.clear()
     self.entityList = _mergeEntityList(self.entityList, self.core.entityList)
     self.core.entityList = [dict() for _ in range(len(NEList))]
     self.proList = _mergeProList(self.proList, self.core.proList)
     self.core.proList = list()

     # resolveCoref receives None when synonym resolution is disabled.
     entities = self.resolveSynonym() if self.synonym else None
     if self.coref:
         self.resolveCoref(entities)
     return [text]

Exemplo n.º 7

0

Exibir arquivo

 def addUrls(self, urls):
     """Fetch the text behind ``urls`` and add all of it.

     Returns:
         A list with the preprocessed form of every fetched text item.
     """
     texts = self._grabTextFromUrls(urls)
     self.addAll(texts)
     return list(map(preprocessText, texts))

Exemplo n.º 8

0

Exibir arquivo

 def _addEntity(self, pid, chunks):
     """Add the chunk ``chunks[pid]`` and its children to the graph.

     First the children are scanned for a subject (``func`` in
     ``SubDict``); the last such child wins, except that a non-negated
     "では" child clears the subject found so far.  The parent node is
     then added, together with a "陳述" edge from the subject when one was
     found.  Finally every child is visited again and linked to the
     parent: subject children only when they are a non-negated "では",
     adjacent parallel children with a pair of "並列" edges (also recorded
     in ``self.para``), and all remaining children with an attribute edge
     labelled by their ``func``.
     """
     parent = chunks[pid]

     def isNegated(chunk):
         # NOTE(review): each item yielded by successors() is unpacked into
         # (key, val) and val.negative is read -- confirm this matches what
         # self.G.successors actually yields.
         return chunk.negative != 0 or any([val.negative != 0 for key, val in self.G.successors(chunk.main)])

     def attach(chunk):
         # Plain attribute link from the child to the parent.
         self._addNode(chunk)
         self._addEdge(chunk.main, parent.main, label=chunk.func, etype="attr")

     def attachParallel(chunk):
         # Parallel link in both directions, remembered in self.para.
         self._addNode(chunk)
         self._addEdge(chunk.main, parent.main, label="並列", etype="para")
         self._addEdge(parent.main, chunk.main, label="並列", etype="para")
         self.para.append([chunk.main, parent.main])

     # Find subject
     sub = None
     for childId in parent.children:
         child = chunks[childId]
         if child.func not in SubDict:
             continue
         sub = child
         if child.func == "では" and not isNegated(child):
             sub = None

     if sub:
         self._addNode(parent, sub=sub.main)
         self._addEdge(sub.main, parent.main, label="陳述", etype="stat")
     else:
         self._addNode(parent)

     # Loop through all children
     for childId in parent.children:
         child = chunks[childId]
         if child.func in SubDict:
             # Subject children are only linked when they are a non-negated "では".
             if child.func == "では" and not isNegated(child):
                 attach(child)
         elif child.type == 0 and child.func in ["と", "などと"] and child.id + 1 == parent.id and preprocessText(chunks[parent.parent].main) not in ["交代", "交換"]:
             attachParallel(child)
         elif child.type == 0 and child.func in ParallelDict and child.id + 1 == parent.id:
             attachParallel(child)
         else:
             attach(child)