def OrganizeLex(lexiconLocation, _CommentDict, _LexiconDict):
    with open(lexiconLocation, encoding='utf-8') as dictionary:
        oldWord = "firstCommentLine"
        for line in dictionary:
            if line.startswith("//"):
                if _CommentDict.get(oldWord):
                    _CommentDict.update({oldWord: _CommentDict.get(oldWord) + line})
                else:
                    _CommentDict.update({oldWord: line})
                continue
            code, comment = utils.SeparateComment(line)
            blocks = [x.strip() for x in re.split(":", code) if x]
            if len(blocks) != 2:
                continue

            newNode = False
            node = SearchLexicon(blocks[0], 'origin')
            # node = None
            if not node:
                newNode = True
                node = LexiconNode(blocks[0])
                if "_" in node.text:
                    node.forLookup = True  # for those combination words.
                if comment:
                    node.comment = comment
            # else:
            #     logging.debug("This word is repeated in lexicon: %s" % blocks[0])

            features, node = SplitFeaturesWithSemicolon(blocks[1], node)
            for feature in features:
                if re.match('^\'.*\'$', feature):
                    node.norm = feature.strip('\'')
                elif re.match('^/.*/$', feature):
                    node.atom = feature.strip('/')
                elif re.search(u'[\u4e00-\u9fff]', feature):
                    node.norm = feature
                    continue
                else:
                    featureID = GetFeatureID(feature)
                    if featureID == -1:
                        logging.info("Missing Feature: " + feature)
                        if not feature.startswith("\\"):
                            node.missingfeature += "\\" + feature
                        else:
                            node.missingfeature = feature
                    node.features.add(featureID)
                    ontologynode = SearchFeatureOntology(featureID)
                    if ontologynode:
                        ancestors = ontologynode.ancestors
                        if ancestors:
                            node.features.update(ancestors)

            if newNode:
                _LexiconDict.update({node.text: node})
                # logging.debug(node.word)
            oldWord = blocks[0]

    logging.info("Finished loading lexicon " + lexiconLocation)
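# A minimal sketch of the lexicon-entry shape OrganizeLex appears to expect, inferred
# from the parsing above (the exact file layout is an assumption, not a documented spec):
#
#   // comment lines starting with "//" are attached to the previous word in _CommentDict
#   word: feature1 feature2              (bare features are resolved through GetFeatureID)
#   word_compound: 'normalform' /atomstring/ feature3
#
# Quoted values ('...') become node.norm, slash-delimited values (/.../) become node.atom,
# a feature containing CJK characters (U+4E00-U+9FFF) is also treated as a norm, and words
# containing "_" are marked forLookup. Lines that do not split into exactly two parts on
# ":" are skipped.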
def LoadLexiconFilterlist(BlacklistLocation):
    if BlacklistLocation.startswith("."):
        BlacklistLocation = os.path.join(os.path.dirname(os.path.realpath(__file__)), BlacklistLocation)
    with open(BlacklistLocation, encoding="utf-8") as dictionary:
        for lined in dictionary:
            word, _ = utils.SeparateComment(lined)
            if word:
                _LexiconFilterSet.add(word)
def LoadAppendixList(featureOncologyLocation):
    Folder = os.path.dirname(featureOncologyLocation)

    NoShowFileLocation = os.path.join(Folder, "featureNotShow.txt")
    with open(NoShowFileLocation, encoding="utf-8") as dictionary:
        for line in dictionary:
            word, _ = utils.SeparateComment(line)
            if not word:
                continue
            NotShowList.append(GetFeatureID(word))

    NoCopyFileLocation = os.path.join(Folder, "featureNotCopy.Parser.txt")
    with open(NoCopyFileLocation, encoding="utf-8") as dictionary:
        for line in dictionary:
            word, _ = utils.SeparateComment(line)
            if not word:
                continue
            NotCopyList.append(GetFeatureID(word))
def LoadTopCharacters(FileLocation):
    Top500 = ""
    with open(FileLocation, encoding="utf-8") as dictionary:
        for lined in dictionary:
            characters, _ = utils.SeparateComment(lined)
            if not characters:
                continue
            Top500 += characters
    # Note: despite the variable name, the first return value is only the first 100 characters.
    return Top500[:100], Top500
def AlignMain():
    newloc = "outputMain.txt"
    with open(newloc, 'w', encoding='utf-8') as file:
        with open(paraMain, encoding='utf-8') as dictionary:
            for line in dictionary:
                if line.startswith("//"):
                    file.write(line)
                    continue
                code, comment = utils.SeparateComment(line)
                if (code not in _LexiconDictB and code not in _LexiconDictP
                        and code not in _LexiconDictL and code not in _LexiconDictI
                        and code not in _LexiconDictI4 and code not in _LexiconDictLexX
                        and code not in _LexiconDictDefX):
                    file.write(code + " " + comment + "\n")
    shutil.move(newloc, paraMain)
def LoadFeatureSet(featureOncologyLocation):
    global _FeatureList, _FeatureDict, _FeatureSet

    _FeatureSet.clear()
    with open(featureOncologyLocation, encoding="utf-8") as dictionary:
        for line in dictionary:
            code, __ = utils.SeparateComment(line)
            features = [x.strip() for x in re.split(r"[,;=\s]", code) if x]
            for feature in features:
                # Skip quoted norms ('...') and atom strings (/.../); only bare feature names count.
                if re.match('^\'.*\'$', feature) or re.match('^/.*/$', feature):
                    continue
                _FeatureSet.add(feature)

    _FeatureList = sorted(_FeatureSet)
    _FeatureDict = {f: ID for ID, f in enumerate(_FeatureList)}
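# A small illustration (hypothetical feature names) of the structures LoadFeatureSet builds:
# if feature.txt yields the set {"noun", "verb", "pronoun"}, then
#   _FeatureList == ["noun", "pronoun", "verb"]            (sorted)
#   _FeatureDict == {"noun": 0, "pronoun": 1, "verb": 2}
# so a feature ID is simply the feature's position in the sorted list, which is presumably
# what GetFeatureID looks up (GetFeatureID itself is defined outside this excerpt).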
def LoadLexiconBlacklist(BlacklistLocation):
    if BlacklistLocation.startswith("."):
        BlacklistLocation = os.path.join(os.path.dirname(os.path.realpath(__file__)), BlacklistLocation)
    with open(BlacklistLocation, encoding="utf-8") as dictionary:
        for lined in dictionary:
            content, _ = utils.SeparateComment(lined)
            if not content:
                continue
            if " " in content or "\t" in content:
                spaceindex = content.find(" ")
                if spaceindex < 0:
                    spaceindex = content.find("\t")
                _word = content[:spaceindex] + "$"
                _freq = int(content[spaceindex + 1:])
            else:
                _word = content[0] + "$"
                _freq = Freq_Basic_Blacklist
            _Blacklist_Freq[_word] = _freq
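# Expected blacklist line shape, inferred from the parsing in LoadLexiconBlacklist rather
# than from a documented spec: either "token<space-or-tab>frequency", or a bare entry, in
# which case only its first character is kept and Freq_Basic_Blacklist is used as the
# frequency. A trailing "$" is appended to every key stored in _Blacklist_Freq.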
def SetAncestors(self, line):
    code, comment = utils.SeparateComment(line)
    self.Comment = comment
    code = self.ProcessAliasInFeatureFile(code)
    if len(code) == 0:
        return
    features = [x.strip() for x in re.split("[,; ]", code) if x]
    openWord = features[0]
    openWordID = GetFeatureID(openWord)
    TryOldNode = SearchFeatureOntology(openWordID)
    if TryOldNode:
        if len(features) > 1:
            for feature in features[1:]:
                TryOldNode.ancestors.add(GetFeatureID(feature))
    else:
        self.openWord = openWord
        self.openWordID = openWordID
        if len(features) > 1:
            for feature in features[1:]:
                fid = GetFeatureID(feature)
                self.ancestors.add(fid)
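# A hypothetical feature.txt line in the shape SetAncestors handles, assumed from the
# "[,; ]" split here and from the "remove the alias" handling in OutputFeatureOntologyGraph
# below (the real alias syntax lives in ProcessAliasInFeatureFile, not shown here):
#
#   openWord=alias,ancestor1,ancestor2;ancestorOnAnotherPath   // comment
#
# The first token is the open word; every following token is converted to a feature ID and
# added to the node's ancestors (or to an already-registered node for the same open word).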
level = logging.INFO
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)
logging.basicConfig(level=level, format='%(asctime)s [%(levelname)s] %(message)s')

UnitTest = {}
if not os.path.exists(args.inputfile):
    print("Unit Test file " + args.inputfile + " does not exist.")
    exit(0)
# The input file is assumed to hold tab-separated lines of "<TestSentence>\t<expected value>".
with open(args.inputfile, encoding="utf-8") as RuleFile:
    for line in RuleFile:
        if line.strip():
            Content, _ = utils.SeparateComment(line.strip())
            if Content and '\t' in Content:
                # Test files carry only the test sentence and a value, no rule name,
                # so split on the first tab only.
                TestSentence, Sales = Content.split('\t', 1)
                UnitTest[TestSentence] = int(float(Sales))

for Sentence in UnitTest:
    LexicalAnalyzeURL = utils.ParserConfig.get("main", "url_larestfulservice") + "/LexicalAnalyze?Type=json&Sentence="
    ret = requests.get(LexicalAnalyzeURL + "\"" + Sentence + "\"")
    root = jsonpickle.decode(ret.text)
    for s in root['sons']:  # ignore the root
        AccumulateNodes(s)
    # AccumulateNodes(root)
def OutputFeatureOntologyGraph():
    # output = "//***Ontology***" + "\n"
    if not hasattr(OutputFeatureOntologyGraph, "graph"):
        from collections import defaultdict
        OutputFeatureOntologyGraph.outbound = defaultdict(int)
        OutputFeatureOntologyGraph.inbound = defaultdict(int)
        OutputFeatureOntologyGraph.nodeset = set()
        OutputFeatureOntologyGraph.graph = set()

        PipeLineLocation = utils.ParserConfig.get("main", "Pipelinefile")
        XLocation = os.path.dirname(PipeLineLocation)
        with open(XLocation + '/../Y/feature.txt', encoding="utf-8") as dictionary:
            for line in dictionary:
                code, comment = utils.SeparateComment(line)
                if "," not in code:
                    continue  # no edge, ignore.
                OpenWord, ancestors = code.split(",", 1)
                OpenWordID = GetFeatureID(OpenWord.split("=", 1)[0].strip())  # remove the alias.
                if OpenWordID == -1:
                    logging.warning("OutputFeatureOntologyGraph: wrong word ID for line {}.".format(code))
                    continue
                for path in ancestors.split(";"):
                    prev = OpenWordID
                    for node in path.split(","):
                        if node.strip():
                            parentid = GetFeatureID(node.strip())
                            if parentid == -1:
                                logging.warning("OutputFeatureOntologyGraph: wrong parentid for node {}".format(node))
                                continue
                            if (prev, parentid) not in OutputFeatureOntologyGraph.graph:
                                OutputFeatureOntologyGraph.graph.add((prev, parentid))
                                OutputFeatureOntologyGraph.outbound[prev] += 1
                                OutputFeatureOntologyGraph.inbound[parentid] += 1
                                OutputFeatureOntologyGraph.nodeset.add(prev)
                                OutputFeatureOntologyGraph.nodeset.add(parentid)
                            prev = GetFeatureID(node.strip())

    output = "{\n"
    for node in sorted(OutputFeatureOntologyGraph.nodeset):
        output += "{} [label=\"{}\" tooltip=\"Inbound:{} Outbound:{} \" ];\n".format(
            node, GetFeatureName(node),
            OutputFeatureOntologyGraph.inbound[node],
            OutputFeatureOntologyGraph.outbound[node])
    for edge in sorted(OutputFeatureOntologyGraph.graph, key=operator.itemgetter(0, 1)):
        # output += GetFeatureName(edge[0]) + "->" + GetFeatureName(edge[1]) + "\n"
        output += "\t{}->{} ;\n".format(edge[0], edge[1])
    output += "}\n"

    logging.info("In feature ontology, there are {} edges for {} nodes.".format(
        len(OutputFeatureOntologyGraph.graph), len(OutputFeatureOntologyGraph.nodeset)))
    return output
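# A minimal usage sketch for the graph output above (hypothetical helper, not part of the
# original module): the returned string is a brace-delimited Graphviz body, so it still
# needs a "digraph" header before tools like `dot` will accept it. Assumes Graphviz is
# installed and on PATH; the file name is illustrative only.
def RenderFeatureOntologyGraph(outputlocation="featureontology.gv"):
    body = OutputFeatureOntologyGraph()
    with open(outputlocation, "w", encoding="utf-8") as f:
        # Prepend the graph type so the file is a complete DOT document.
        f.write("digraph FeatureOntology " + body)
    # Render with, e.g.:  dot -Tsvg featureontology.gv -o featureontology.svg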