Example #1
def load_data(input_file):
    """Parse a whitespace-delimited relation file into a list of Documents."""
    doc_list = []
    doc = Document()
    ne_dict = {}

    with open(input_file, 'r') as raw_data:
        for rline in raw_data:
            if not rline.strip():
                continue
            entry = rline.split()
            # a 14-column row carries a relation label in column 0,
            # so every field index shifts by one
            i = 1 if len(entry) == 14 else 0
            docID = entry[i]

            # a new document starts whenever the document id changes
            if docID != doc.docID:
                # record the named-entity dictionary of the previous document
                doc.set_ne_dict(ne_dict)
                if doc.docID:  # skip the initial empty placeholder
                    doc_list.append(doc)
                ne_dict = {}
                doc = Document(docID)

            first = Entity(entry[i+1], (entry[i+2], entry[i+3]), entry[i+4], entry[i+5], entry[i+6])
            ne_dict[entry[i+5]] = (entry[i+1], entry[i+2])
            second = Entity(entry[i+7], (entry[i+8], entry[i+9]), entry[i+10], entry[i+11], entry[i+12])
            ne_dict[entry[i+11]] = (entry[i+7], entry[i+8])
            pair = Pair(first, second)
            if i:
                pair.set_label(entry[0])
            doc.add_pair(pair)

    # flush the last document
    doc.set_ne_dict(ne_dict)
    doc_list.append(doc)
    return doc_list
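
The Document, Entity, and Pair classes used by this loader are not shown. A minimal sketch of the interface the code above assumes, inferred only from how the names are used (every attribute and parameter name below is a guess), could look like this:

class Entity:
    # one entity mention; the argument names are hypothetical, only the arity
    # (4 or 5 positional arguments) is taken from the loaders above
    def __init__(self, text, span, etype, eid, extra=None):
        self.text = text
        self.span = span          # (start, end) offsets
        self.etype = etype
        self.eid = eid
        self.extra = extra


class Pair:
    # a candidate relation between two entities with an optional gold label
    def __init__(self, first, second):
        self.first = first
        self.second = second
        self.label = None

    def set_label(self, label):
        self.label = label


class Document:
    # groups all pairs that share one document id
    def __init__(self, docID=''):
        self.docID = docID
        self.pairs = []
        self.ne_dict = {}

    def add_pair(self, pair):
        self.pairs.append(pair)

    def set_ne_dict(self, ne_dict):
        self.ne_dict = ne_dict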
Example #2
    def DoMark(self, request, context):
        """Handler function for the gec service.

        Args:
            request: the request sent by the rpc client.
            context: unused.

        Returns: the gec-processed text and the corresponding error
            information, as a JSON string.
        """
        gec_output = json.loads(request.marker_input.strip())
        doc_sents = [
            gec_output["sentence_" + str(i)]["corr_sent"]
            for i in range(gec_output["sent_nums"])
        ]
        doc_encodes = self.__bert_predictor.encode(doc_sents)
        prompt_sents = sentence_tokenize(gec_output["title"])
        prompt_encodes = self.__bert_predictor.encode(prompt_sents)

        # article_id and article_set play no role at inference time; they are
        # only kept so the feature types stay consistent with training
        features = dict()
        features["doc_encodes"] = np.expand_dims(doc_encodes, 0)
        features["article_set"] = np.array([9], np.int64)
        features["domain1_score"] = np.array([0], np.float32)
        features["article_id"] = np.array([0], np.int64)
        features["doc_sent_num"] = np.array([gec_output["sent_nums"]],
                                            np.int64)
        features["prompt_encodes"] = np.expand_dims(prompt_encodes, 0)
        try:
            bsp_score = self.__bsp_stub.Predict(
                self.__format_request("bsp", features), 10)
            bsp_score = bsp_score.outputs["batch_scores"].float_val[0]
            csp_score = self.__csp_stub.Predict(
                self.__format_request("csp", features), 10)
            csp_score = csp_score.outputs["batch_scores"].float_val[0]
            psp_score = self.__psp_stub.Predict(
                self.__format_request("psp", features), 10)
            psp_score = psp_score.outputs["batch_scores"].float_val[0]

            temp_doc = Document(gec_output)
            handcrafted_features = temp_doc.features
            doc_result = temp_doc.doc_result
            handcrafted_features.extend([bsp_score, csp_score, psp_score])
            # DMatrix expects a 2-D matrix, so wrap the single feature row
            dtest = xgb.DMatrix(np.array([handcrafted_features], dtype=np.float32))
            overall_score = self.__bst.predict(dtest)[0]
            doc_result["score_lexical"] = bsp_score
            doc_result["score_coherence"] = csp_score
            doc_result["score_gramatical"] = bsp_score
            doc_result["score_task"] = psp_score
            doc_result["score_summary"] = float(overall_score)
            gec_output["score_result"] = doc_result
            return rpc_server_pb2.MarkerData(
                marker_output=json.dumps(gec_output))
        except Exception as err:
            # surface the underlying failure instead of an empty error message
            raise ValueError("scoring request failed") from err
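
For context, a client would call DoMark over gRPC roughly as follows. This is a hypothetical sketch: the stub module rpc_server_pb2_grpc, the stub class MarkerStub, the server address, and the assumption that the request reuses the MarkerData message are not confirmed by the code above; only marker_input, marker_output, and the sent_nums / sentence_N / title fields appear in it.

import json

import grpc

import rpc_server_pb2
import rpc_server_pb2_grpc  # assumed name of the generated gRPC stub module

# assumed address and stub class name
channel = grpc.insecure_channel("localhost:50051")
stub = rpc_server_pb2_grpc.MarkerStub(channel)

# minimal gec output with only the fields DoMark reads directly;
# Document(gec_output) may require more fields than shown here
gec_output = {
    "sent_nums": 1,
    "title": "An essay prompt",
    "sentence_0": {"corr_sent": "This is a corrected sentence."},
}

request = rpc_server_pb2.MarkerData(marker_input=json.dumps(gec_output))
response = stub.DoMark(request)
result = json.loads(response.marker_output)
print(result["score_result"]["score_summary"])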
Example #3
def load_data(input_file):
    """Parse a whitespace-delimited relation file into a list of Documents."""
    doc_list = []
    doc = Document()

    with open(input_file, 'r') as raw_data:
        for rline in raw_data:
            if not rline.strip():
                continue
            entry = rline.split()
            docID = entry[0]

            # a new document starts whenever the document id changes
            if docID != doc.docID:
                if doc.docID:  # skip the initial empty placeholder
                    doc_list.append(doc)
                doc = Document(docID)

            first = Entity(entry[1], (entry[2], entry[3]), entry[4], entry[5])
            second = Entity(entry[6], (entry[7], entry[8]), entry[9], entry[10])
            pair = Pair(first, second)
            # a 12th column, when present, is the gold relation label
            if len(entry) == 12:
                pair.set_label(entry[11])
            doc.add_pair(pair)

    # flush the last document
    doc_list.append(doc)
    return doc_list
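
A small usage sketch for this loader; the file name is made up and the pairs, first, second, and label attributes follow the hypothetical interface sketched after Example #1:

# hypothetical usage
docs = load_data("train.txt")
print("loaded {} documents".format(len(docs)))
for doc in docs:
    for pair in doc.pairs:
        # label is None when the input rows had no 12th column
        print(doc.docID, pair.first.text, pair.second.text, pair.label)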
Example #4
    def loadIndex(self, path):
        """
        Loads the self.documents and self.invertedIndex dicts data from a file
        created with the self.saveIndex method.

        If the file cannot be opened, the exception is not handled.

        param path: string containing the path to the file to load.
        return: None.
        """
        print("Loading index from the file: {}".format(path))
        fin = open(path)

        # regex for parsing the self.documents
        docRegex = re.compile(
            r"(?P<id>\d+);(?P<year>\d+);(?P<title>.+);(?P<authors>.+)?;(?P<norm>.+)"
        )
        # parsing the documents data
        for line in fin:
            line = line.strip()
            # if there's something in the line it must be data about a document
            if line:
                # if the line starts with a # ignore it
                if not line.startswith("#"):
                    match = docRegex.match(line)
                    docId = int(match.group("id"))
                    year = match.group("year")
                    title = match.group("title")
                    authors = match.group("authors")
                    norm = float(match.group("norm"))

                    self.documents[docId] = Document(docId, year, title,
                                                     authors, norm)
            # an empty line means the documents part has ended
            else:
                break

        # regex for parsing the self.invertedIndex
        indexRegex = re.compile(r"(?P<word>.+);(?P<idf>.+);(?P<lst>.+)")
        # parsing the inverted index data
        for line in fin:
            line = line.strip()
            # skip empty lines and comment lines starting with a #
            if line and not line.startswith("#"):
                match = indexRegex.match(line)
                word = match.group("word")
                idf = float(match.group("idf"))
                lst = ast.literal_eval(match.group("lst"))
                self.invertedIndex[word] = (idf, lst)
        fin.close()
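
loadIndex reads a file produced by self.saveIndex, whose body is not shown here. From the two regexes, the expected layout is a documents section of id;year;title;authors;norm lines, a blank separator line, then word;idf;postings lines. A hypothetical sketch of a writer producing that layout (the Document attribute names and the comment headers are assumptions) might be:

    def saveIndex(self, path):
        # hypothetical sketch of the counterpart writer: it emits exactly the
        # layout that the regexes in loadIndex parse back in
        with open(path, "w") as fout:
            fout.write("# documents: id;year;title;authors;norm\n")
            for docId, doc in self.documents.items():
                fout.write("{};{};{};{};{}\n".format(
                    docId, doc.year, doc.title, doc.authors, doc.norm))
            # a blank line ends the documents section
            fout.write("\n")
            fout.write("# inverted index: word;idf;postings\n")
            for word, (idf, lst) in self.invertedIndex.items():
                fout.write("{};{};{}\n".format(word, idf, lst))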
Example #5
    def treatLastDoc(self, lastDoc):
        """
        Helper method that transforms the data in the lastDoc dict into a tuple
        of util.Document object and a Counter containing the frequencies of the
        words in the document

        param lastDoc: a dict containing the data parsed.
        return: a tuple(util.Document, collections.Counter). The counter is a
        dict with word keys and frequency values.
        """
        total = Counter()

        # the list of relevant attributes to tokenize. Tokenize also
        # removes stop words defined in the init method
        relevant = ["TI", "AB", "EX", "MJ", "MN"]
        for attr in relevant:
            content = lastDoc[attr]
            assert isinstance(content, str)
            words = self.tokenize(content)
            counter = Counter(words)
            total += counter

        # build the Document object to return
        docId = int(lastDoc["RN"])

        # get the year of publication
        regex = r"(?P<year>\d{2})(?P<idInYear>\d{3})"
        sep = re.compile(regex)
        match = sep.match(lastDoc["PN"])
        year = int(match.group("year"))

        title = lastDoc["TI"]
        authors = lastDoc["AU"]
        tempNorm = 0  # irrelevant norm to be updated in the future
        doc = Document(docId, year, title, authors, tempNorm)

        result = doc, total
        return result
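
A sketch of the record dict treatLastDoc expects, built only from the keys read above; the field values and the indexer instance name are invented for illustration:

# hypothetical record; every key below is read by treatLastDoc
lastDoc = {
    "RN": "42",     # record number, becomes the integer docId
    "PN": "74021",  # "74" -> two-digit year, "021" -> id within that year
    "TI": "A title about document indexing",
    "AU": "Doe J.; Roe R.",
    "AB": "Abstract text of the document.",
    "EX": "",                     # extract, may be empty
    "MJ": "INDEXING RETRIEVAL",   # tokenized like TI/AB/EX; meaning assumed
    "MN": "EVALUATION",
}

doc, frequencies = indexer.treatLastDoc(lastDoc)
print(frequencies.most_common(5))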