def load_data(input_file):
    """Parse one whitespace-delimited entity pair per line into Document/Pair/Entity objects."""
    doc_list = []
    doc = Document()
    ne_dict = {}
    with open(input_file, 'r') as raw_data:
        for rline in raw_data:
            if not rline.strip():
                continue
            entry = rline.split()
            # labelled lines have 14 columns: the relation label comes first
            i = 1 if len(entry) == 14 else 0
            docID = entry[i]
            if docID != doc.docID:
                # a new document starts: record the named-entity dictionary
                # built so far and finalize the previous document
                doc.set_ne_dict(ne_dict)
                doc_list.append(doc)
                ne_dict = {}
                doc = Document(docID)
            first = Entity(entry[i + 1], (entry[i + 2], entry[i + 3]),
                           entry[i + 4], entry[i + 5], entry[i + 6])
            ne_dict[entry[i + 5]] = (entry[i + 1], entry[i + 2])
            second = Entity(entry[i + 7], (entry[i + 8], entry[i + 9]),
                            entry[i + 10], entry[i + 11], entry[i + 12])
            ne_dict[entry[i + 11]] = (entry[i + 7], entry[i + 8])
            pair = Pair(first, second)
            if i:
                pair.set_label(entry[0])
            doc.add_pair(pair)
    # finalize the last document
    doc.set_ne_dict(ne_dict)
    doc_list.append(doc)
    return doc_list
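
# Hedged usage sketch for load_data above: based on the indexing, each non-empty
# input line is assumed to carry 13 whitespace-separated columns (docID followed
# by two entities of six fields each), or 14 columns when a relation label is
# prepended. The file name below is a placeholder, not taken from the source.
if __name__ == "__main__":
    docs = load_data("train_pairs.txt")      # hypothetical input path
    print("loaded {} documents".format(len(docs)))
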
def DoMark(self, request, context):
    """Handler for the GEC scoring service.

    Args:
        request: the request sent by the RPC client.
        context: the gRPC context (unused).

    Returns:
        The GEC-processed text and the corresponding scoring information,
        as a JSON string wrapped in a MarkerData message.
    """
    gec_output = json.loads(request.marker_input.strip())
    doc_sents = [
        gec_output["sentence_" + str(i)]["corr_sent"]
        for i in range(gec_output["sent_nums"])
    ]
    doc_encodes = self.__bert_predictor.encode(doc_sents)
    prompt_sents = sentence_tokenize(gec_output["title"])
    prompt_encodes = self.__bert_predictor.encode(prompt_sents)

    # article_id and article_set play no role at inference time; they are only
    # included so the feature dtypes stay consistent with training.
    features = dict()
    features["doc_encodes"] = np.expand_dims(doc_encodes, 0)
    features["article_set"] = np.array([9], np.int64)
    features["domain1_score"] = np.array([0], np.float32)
    features["article_id"] = np.array([0], np.int64)
    features["doc_sent_num"] = np.array([gec_output["sent_nums"]], np.int64)
    features["prompt_encodes"] = np.expand_dims(prompt_encodes, 0)

    try:
        bsp_score = self.__bsp_stub.Predict(
            self.__format_request("bsp", features), 10)
        bsp_score = bsp_score.outputs["batch_scores"].float_val[0]
        csp_score = self.__csp_stub.Predict(
            self.__format_request("csp", features), 10)
        csp_score = csp_score.outputs["batch_scores"].float_val[0]
        psp_score = self.__psp_stub.Predict(
            self.__format_request("psp", features), 10)
        psp_score = psp_score.outputs["batch_scores"].float_val[0]

        temp_doc = Document(gec_output)
        handcrafted_features = temp_doc.features
        doc_result = temp_doc.doc_result
        handcrafted_features.extend([bsp_score, csp_score, psp_score])
        dtest = xgb.DMatrix(handcrafted_features)
        overall_score = self.__bst.predict(dtest)[0]

        doc_result["score_lexical"] = bsp_score
        doc_result["score_coherence"] = csp_score
        doc_result["score_gramatical"] = bsp_score  # note: reuses the BSP score
        doc_result["score_task"] = psp_score
        doc_result["score_summary"] = float(overall_score)
        gec_output["score_result"] = doc_result
        return rpc_server_pb2.MarkerData(
            marker_output=json.dumps(gec_output))
    except Exception as exc:
        raise ValueError("DoMark failed while scoring the document") from exc
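
# A minimal sketch of what self.__format_request could build, assuming the bsp/csp/
# psp backends are standard TensorFlow Serving models reached over gRPC. The
# signature name and the stand-alone function form are assumptions; the original
# method is not shown in the source.
import tensorflow as tf
from tensorflow_serving.apis import predict_pb2

def format_request_sketch(model_name, features):
    """Wrap a dict of numpy feature arrays into a TF Serving PredictRequest."""
    request = predict_pb2.PredictRequest()
    request.model_spec.name = model_name                   # e.g. "bsp", "csp", "psp"
    request.model_spec.signature_name = "serving_default"  # assumed signature name
    for key, value in features.items():
        # each value is already a numpy array with the dtype expected by the model
        request.inputs[key].CopyFrom(tf.make_tensor_proto(value))
    return request
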
def load_data(input_file):
    """Parse one whitespace-delimited entity pair per line into Document objects."""
    doc_list = []
    doc = Document()
    with open(input_file, 'r') as raw_data:
        for rline in raw_data:
            if not rline.strip():
                continue
            entry = rline.split()
            docID = entry[0]
            if docID != doc.docID:
                # a new document starts: finalize the previous one
                doc_list.append(doc)
                doc = Document(docID)
            first = Entity(entry[1], (entry[2], entry[3]), entry[4], entry[5])
            second = Entity(entry[6], (entry[7], entry[8]), entry[9], entry[10])
            pair = Pair(first, second)
            # labelled lines carry the relation label in a 12th column
            if len(entry) == 12:
                pair.set_label(entry[11])
            doc.add_pair(pair)
    # finalize the last document
    doc_list.append(doc)
    return doc_list
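
# Hedged end-to-end sketch for this second load_data variant: an 11-column line is
# unlabelled, while a 12-column line carries the relation label in the last column.
# The sample field values below are invented purely to illustrate the column layout
# and do not come from the source data.
import tempfile

sample = (
    "doc1 Protein1 0 8 NN T1 Protein2 12 20 NN T2\n"           # 11 columns, no label
    "doc1 Protein1 0 8 NN T1 Protein3 25 33 NN T3 Positive\n"  # 12 columns, labelled
)
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as tmp:
    tmp.write(sample)
docs = load_data(tmp.name)
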
def loadIndex(self, path):
    """
    Loads the self.documents and self.invertedIndex dicts from a file
    created with the self.saveIndex method. If the file cannot be opened,
    the exception is not handled here.

    param path: string containing the path to the file to load.
    return: None.
    """
    print("Loading index from the file: {}".format(path))
    with open(path) as fin:
        # regex for parsing the self.documents section
        docRegex = re.compile(
            r"(?P<id>\d+);(?P<year>\d+);(?P<title>.+);(?P<authors>.+)?;(?P<norm>.+)"
        )
        # parsing the documents data
        for line in fin:
            line = line.strip()
            # a non-empty line holds data about one document
            if line:
                # lines starting with '#' are comments and are ignored
                if not line.startswith("#"):
                    match = docRegex.match(line)
                    docId = int(match.group("id"))
                    year = match.group("year")
                    title = match.group("title")
                    authors = match.group("authors")
                    norm = float(match.group("norm"))
                    self.documents[docId] = Document(docId, year, title,
                                                     authors, norm)
            # an empty line marks the end of the documents section
            else:
                break
        # regex for parsing the self.invertedIndex section
        indexRegex = re.compile(r"(?P<word>.+);(?P<idf>.+);(?P<lst>.+)")
        # parsing the inverted index data; skip comments and blank lines
        # (a trailing blank line would otherwise make the regex match fail)
        for line in fin:
            line = line.strip()
            if line and not line.startswith("#"):
                match = indexRegex.match(line)
                word = match.group("word")
                idf = float(match.group("idf"))
                lst = ast.literal_eval(match.group("lst"))
                self.invertedIndex[word] = (idf, lst)
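
# A hedged sketch of the saveIndex counterpart referenced in the docstring above,
# reconstructed purely from the formats that loadIndex parses: one
# "id;year;title;authors;norm" line per document, a blank separator line, then one
# "word;idf;postings" line per index entry. It assumes Document exposes year,
# title, authors and norm attributes, which the source does not confirm.
def saveIndex_sketch(self, path):
    with open(path, "w") as fout:
        fout.write("# documents: id;year;title;authors;norm\n")
        for docId, doc in self.documents.items():
            fout.write("{};{};{};{};{}\n".format(
                docId, doc.year, doc.title, doc.authors, doc.norm))
        fout.write("\n")  # blank line ends the documents section
        fout.write("# inverted index: word;idf;postings\n")
        for word, (idf, lst) in self.invertedIndex.items():
            fout.write("{};{};{}\n".format(word, idf, lst))
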
def treatLastDoc(self, lastDoc):
    """
    Helper method that transforms the data in the lastDoc dict into a tuple
    of a util.Document object and a Counter with the frequencies of the
    words in the document.

    param lastDoc: a dict containing the parsed data.
    return: a tuple (util.Document, collections.Counter). The Counter maps
            words to their frequencies.
    """
    total = Counter()
    # the relevant attributes to tokenize; tokenize also removes the
    # stop words defined in the init method
    relevant = ["TI", "AB", "EX", "MJ", "MN"]
    for attr in relevant:
        content = lastDoc[attr]
        assert isinstance(content, str)
        words = self.tokenize(content)
        total += Counter(words)

    # build the Document object to return
    docId = int(lastDoc["RN"])
    # extract the year of publication from the PN field (two digits for the
    # year followed by a three-digit id within that year)
    sep = re.compile(r"(?P<year>\d{2})(?P<idInYear>\d{3})")
    match = sep.match(lastDoc["PN"])
    year = int(match.group("year"))
    title = lastDoc["TI"]
    authors = lastDoc["AU"]
    tempNorm = 0  # placeholder norm, to be updated later
    doc = Document(docId, year, title, authors, tempNorm)
    return doc, total
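
# Hedged usage sketch for treatLastDoc: the record keys below (RN, PN, TI, AB, EX,
# MJ, MN, AU) are exactly the ones the method reads; the field values are invented
# for illustration and the parser instance name `parser` is an assumption.
record = {
    "RN": "42",
    "PN": "63017",                     # two-digit year + three-digit id within the year
    "TI": "An example title",
    "AB": "Abstract text of the document.",
    "EX": "",
    "MJ": "indexing terms",
    "MN": "more terms",
    "AU": "Doe, J.",
}
doc, freqs = parser.treatLastDoc(record)
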