def processDoc(line, dicts, config=defualtconfig):
    """Find dictionary-entity candidate windows inside one document record.

    Args:
        line: Mapping holding the document. The id is read from the key
            named by ``config["document"]["id_attribute"]`` and the text
            from the keys listed in ``config["document"]["value_attribute"]``
            (joined with single spaces).
        dicts: Indexable collection whose slots 0-6 are, in order:
            inverted list, inverted index, entity token counts, inverted
            list lengths, entity id lookup, entity value lookup and
            ``maxenl`` (presumably the largest entity token count --
            confirm against the code that builds ``dicts``).
        config: Settings providing ``"threshold"``, ``"token_size"`` and
            the ``"document"`` section described above.

    Returns:
        An empty dict when none of the document's n-grams is a key of the
        inverted list. Otherwise a dict with a ``"document"`` entry
        (``"id"`` and ``"value"``) and an ``"entities"`` entry mapping each
        entity id to ``{"value": ..., "candwins": [...]}``, where every
        candidate window carries ``"start"``, ``"end"`` and ``"score"``.
    """
    inverted_list, inverted_index = dicts[0], dicts[1]
    entity_tokennum, inverted_list_len = dicts[2], dicts[3]
    entity_realid, entity_real = dicts[4], dicts[5]
    maxenl = dicts[6]

    threshold = config["threshold"]
    value_fields = config["document"]["value_attribute"]
    gram_size = config["token_size"]

    document_id = line[config["document"]["id_attribute"]]
    original_text = line[value_fields[0]]
    for extra_field in value_fields[1:]:
        original_text += " " + line[extra_field]

    # Matching runs on a lower-cased copy with every space removed; the
    # untouched text is what gets reported back to the caller.
    normalized = original_text.lower().strip().replace(" ", "")
    grams = list(ngrams(normalized, gram_size))
    gram_count = len(grams)
    keys = ["".join(gram) for gram in grams]

    # Seed the heap with the first posting of every n-gram that the
    # inverted list knows about, remembering the n-gram's position.
    heap = []
    for position, key in enumerate(keys):
        try:
            first_posting = inverted_list[key][0]
        except KeyError:
            continue
        heap.append([first_posting, position])

    if not heap:
        return {}

    candidates = singleheap.getcandidates(
        heap, entity_tokennum, inverted_list_len, inverted_index,
        inverted_list, keys, gram_count, maxenl, threshold)

    entities = {}
    result = {
        "document": {"id": document_id, "value": original_text},
        "entities": entities,
    }
    for candidate in candidates:
        # NOTE(review): the +2 offset is a constant and does not follow
        # `token_size`; confirm it is intended when token_size != 2.
        window = {
            "start": candidate[1],
            "end": candidate[2] + 2,
            "score": candidate[3],
        }
        # The lookups may be keyed by the stringified entity index or by
        # the raw one; try the string form first, then the raw value.
        lookup_key = str(candidate[0])
        try:
            entity_id = entity_realid[lookup_key]
        except KeyError:
            lookup_key = candidate[0]
            entity_id = entity_realid[lookup_key]

        if entity_id in entities:
            entities[entity_id]["candwins"].append(window)
        else:
            entities[entity_id] = {
                "value": entity_real[lookup_key],
                "candwins": [window],
            }

    return result
def processDoc(line, dicts, config=defualtconfig):
    """Extract candidate entity windows for a single document.

    Args:
        line: Mapping with the document's fields. Its id comes from
            ``line[config["document"]["id_attribute"]]``; its text is the
            space-joined values of the keys in
            ``config["document"]["value_attribute"]``.
        dicts: Indexable collection; positions 0 through 6 supply the
            inverted list, inverted index, entity token counts, inverted
            list lengths, entity id lookup, entity value lookup and
            ``maxenl``, all of which are handed to
            ``singleheap.getcandidates`` or used to label its results.
        config: Mapping with ``"threshold"``, ``"token_size"`` and a
            ``"document"`` sub-mapping.

    Returns:
        ``{}`` if no n-gram of the document appears in the inverted list;
        otherwise ``{"document": {"id", "value"}, "entities": {...}}`` where
        each entity entry has a ``"value"`` and a ``"candwins"`` list of
        ``{"start", "end", "score"}`` dicts.
    """
    inverted_list = dicts[0]
    inverted_index = dicts[1]
    entity_tokennum = dicts[2]
    inverted_list_len = dicts[3]
    id_by_entity = dicts[4]
    value_by_entity = dicts[5]
    maxenl = dicts[6]

    threshold = config["threshold"]
    text_fields = config["document"]["value_attribute"]
    n = config["token_size"]

    doc_id = line[config["document"]["id_attribute"]]
    doc_text = line[text_fields[0]]
    for field_name in text_fields[1:]:
        doc_text += " " + line[field_name]

    # Candidates are searched in a lower-cased, space-free version of the
    # text, so the reported offsets refer to that compacted string.
    compact_text = doc_text.lower().strip().replace(" ", "")
    tokens = list(ngrams(compact_text, n))
    token_total = len(tokens)

    keys = []
    heap = []
    for index, token in enumerate(tokens):
        joined = "".join(token)
        keys.append(joined)
        try:
            head = inverted_list[joined][0]
        except KeyError:
            # N-gram never occurs in any dictionary entity: nothing to seed.
            continue
        heap.append([head, index])

    output = {}
    if not heap:
        return output

    matches = singleheap.getcandidates(
        heap,
        entity_tokennum,
        inverted_list_len,
        inverted_index,
        inverted_list,
        keys,
        token_total,
        maxenl,
        threshold,
    )
    output["document"] = {"id": doc_id, "value": doc_text}
    output["entities"] = grouped = {}

    for match in matches:
        # NOTE(review): `+ 2` is fixed regardless of `n`; verify that this
        # is still correct for token sizes other than 2.
        window = dict(start=match[1], end=match[2] + 2, score=match[3])

        # Entity lookups are tried with the string form of the index first
        # and fall back to the raw index when that key is missing.
        entity_key = str(match[0])
        try:
            entity_id = id_by_entity[entity_key]
        except KeyError:
            entity_key = match[0]
            entity_id = id_by_entity[entity_key]

        if entity_id not in grouped:
            grouped[entity_id] = {
                "value": value_by_entity[entity_key],
                "candwins": [window],
            }
        else:
            grouped[entity_id]["candwins"].append(window)

    return output
示例#3
0
    def processDoc(self, line):
        """Find candidate entity windows in a raw document string.

        Args:
            line: The document text. It is lower-cased and stripped before
                being split into n-grams of size ``self.token_size``.

        Returns:
            An empty dict when no n-gram of the document is a key of the
            inverted list (``self.dictionary[0]``). Otherwise a dict with
            ``"document"`` (``{"value": line}``) and ``"entities"``, which
            maps each entity id to ``{"value": ..., "candwins": [...]}``;
            every candidate window has ``"start"``, ``"end"`` and
            ``"score"``.
        """
        dictionary = self.dictionary
        inverted_list, inverted_index = dictionary[0], dictionary[1]
        entity_tokennum, inverted_list_len = dictionary[2], dictionary[3]
        entity_realid, entity_real = dictionary[4], dictionary[5]
        maxenl = dictionary[6]

        threshold = self.threshold
        gram_size = self.token_size

        normalized = line.lower().strip()
        grams = list(ngrams(normalized, gram_size))
        gram_count = len(grams)
        keys = ["".join(gram) for gram in grams]

        # Seed the heap with the first posting of each n-gram that exists
        # in the inverted list, paired with the n-gram's position.
        heap = []
        for position, key in enumerate(keys):
            try:
                first_posting = inverted_list[key][0]
            except KeyError:
                continue
            heap.append([first_posting, position])

        if not heap:
            return {}

        candidates = singleheap.getcandidates(
            heap, entity_tokennum, inverted_list_len, inverted_index,
            inverted_list, keys, gram_count, maxenl, threshold)

        entities = {}
        result = {"document": {"value": line}, "entities": entities}
        for candidate in candidates:
            # NOTE(review): the +2 offset is constant and independent of
            # `self.token_size`; confirm it is intended for sizes != 2.
            window = {
                "start": candidate[1],
                "end": candidate[2] + 2,
                "score": candidate[3],
            }
            # Try the stringified entity index first, then the raw value.
            lookup_key = str(candidate[0])
            try:
                entity_id = entity_realid[lookup_key]
            except KeyError:
                lookup_key = candidate[0]
                entity_id = entity_realid[lookup_key]

            if entity_id in entities:
                entities[entity_id]["candwins"].append(window)
            else:
                entities[entity_id] = {
                    "value": entity_real[lookup_key],
                    "candwins": [window],
                }

        return result
示例#4
0
def _nested_value(record, dotted_field):
    """Return ``record[outer][inner]`` for a field written as ``"outer.inner"``."""
    parts = dotted_field.split(".")
    return record[parts[0]][parts[1]]


def run(dictfile, inputfile, dictfileds, docfileds, n=2, threshold=0.8):
    """Match dictionary entities against documents and print the candidates.

    Both files are read as JSON-lines: one JSON object per line, each with
    a ``"uri"`` and a ``"name"`` key. Blank lines are skipped.

    Args:
        dictfile: Path of the entity dictionary file.
        inputfile: Path of the document file.
        dictfileds: ``"outer.inner"`` field paths whose values are appended
            (space separated) to each entity's name.
        docfileds: ``"outer.inner"`` field paths whose values are appended
            (space separated) to each document's name.
        n: N-gram size passed to ``ngrams``.
        threshold: Accepted for interface compatibility; this function does
            not forward it to ``singleheap.getcandidates``.

    Returns:
        None. For every document one JSON object is printed, containing
        ``"document"`` (``"id"``, ``"value"``) and ``"entities"``, which maps
        an entity id to its ``"value"`` and a ``"candwins"`` list of
        ``{"start", "end", "score"}`` windows. ``"entities"`` is empty when
        no n-gram of the document occurs in the dictionary.
    """
    inverted_list = {}       # n-gram key -> entity numbers containing it
    inverted_index = []      # entity number -> normalized entity text
    entity_tokennum = {}     # normalized entity text -> n-gram count
    inverted_list_len = {}   # n-gram key -> length of its inverted list
    entity_realid = {}       # entity number -> last path segment of the uri
    entity_real = {}         # entity number -> original entity text
    maxenl = 0               # largest n-gram count of any entity

    # Pass 1: index the dictionary. The context manager closes the file
    # even if a record is malformed.
    entity_no = 0
    with open(dictfile) as dict_handle:
        for raw in dict_handle:
            if not raw.strip():
                continue
            record = json.loads(raw)
            entity_realid[entity_no] = record["uri"].split("/")[-1]
            entity_text = record["name"]
            for filed in dictfileds:
                entity_text += " " + _nested_value(record, filed)
            entity_real[entity_no] = entity_text

            entity = entity_text.lower().strip()
            inverted_index.append(entity)
            tokens = list(ngrams(entity, n))
            entity_tokennum[entity] = len(tokens)
            maxenl = max(maxenl, len(tokens))

            # Each distinct n-gram lists the entity once.
            for token in set(tokens):
                key = str(token)
                inverted_list.setdefault(key, []).append(entity_no)
                inverted_list_len[key] = inverted_list_len.get(key, 0) + 1
            entity_no += 1

    # Pass 2: look every document up in the index.
    with open(inputfile) as doc_handle:
        for raw in doc_handle:
            if not raw.strip():
                continue
            record = json.loads(raw)
            document_id = record["uri"].split("/")[-1]
            document_real = record["name"]
            for filed in docfileds:
                document_real += " " + _nested_value(record, filed)
            document = document_real.lower().strip()

            entities = {}
            jsonline = {
                "document": {"id": document_id, "value": document_real},
                "entities": entities,
            }

            tokens = list(ngrams(document, n))
            keys = [str(token) for token in tokens]
            # Seed the heap with the first posting of every known n-gram,
            # paired with the n-gram's position in the document.
            heap = [
                [inverted_list[key][0], position]
                for position, key in enumerate(keys)
                if key in inverted_list
            ]
            if heap:
                candidates = singleheap.getcandidates(
                    heap, entity_tokennum, inverted_list_len, inverted_index,
                    inverted_list, keys, len(tokens), maxenl)
                # Each result is read as (entity number, start, end, score).
                for value in candidates:
                    window = {
                        "start": value[1],
                        "end": value[2],
                        "score": value[3],
                    }
                    entry = entities.setdefault(
                        entity_realid[value[0]],
                        {"value": entity_real[value[0]], "candwins": []},
                    )
                    entry["candwins"].append(window)

            # Call form with a single argument prints identically on
            # Python 2 and Python 3.
            print(json.dumps(jsonline))