def getWordAggregationFromFile(fileName, word, entityIndex=None,
                               folder="Texts"):
    """
    Return the weight (tf or tf-idf) stored for a word in a frequency file.

    The file is loaded with readDataFile(fileName, folder). Without an
    entity index the row is selected on the "word" column only and the
    value of its "tf" column is returned. With an entity index the row is
    selected on both the "word" and "entity" columns and the value of the
    last column of the first matching row is returned.

    NB: word should be a word from the vocabulary; when no row matches,
    the function returns 0 as its coefficient.

    Parameters:
        :param fileName: name of the frequency file
        :param word: the word to look up
        :param entityIndex: entity to restrict the lookup to, or None to
            look the word up regardless of entity
        :param folder: folder where to get the file (fileName)

    Returns 0 when nothing matches, otherwise the weight as a float.
    """
    fileFrame = readDataFile(fileName, folder)
    wordMask = fileFrame["word"] == word

    if entityIndex is None:
        rowInfo = fileFrame.loc[wordMask]
        if rowInfo.empty:
            return 0
        return float(rowInfo["tf"].iloc[0])

    rowInfo = fileFrame.loc[wordMask & (fileFrame["entity"] == entityIndex)]
    if rowInfo.empty:
        return 0
    # The previous `rowInfo[-1]` looked up a *column labelled -1* and raised
    # KeyError; positional access is what reads a cell.
    # NOTE(review): assumes the weight sits in the last column of the file
    # -- confirm against the file layout.
    return float(rowInfo.iloc[0, -1])
def numberOfImpEntity(wordImpCSV, wordImpCSVFolder):
    """
    Count the distinct values found in the "entity" column of a data file.

    Parameters:
        :param wordImpCSV: name of the file, loaded through readDataFile
        :param wordImpCSVFolder: folder that contains the file

    The count is printed between marker lines and returned as an int.
    """
    entityColumn = readDataFile(wordImpCSV, wordImpCSVFolder)['entity']
    groupCount = entityColumn.nunique()
    for line in ("### entity groups", groupCount, "###"):
        print(line)
    return int(groupCount)
def analysisValues(csvKB, csvKBFolder):
    """
    Report the number of rows and of empty-string cells of a data file.

    Parameters:
        :param csvKB: name of the file, loaded through readDataFile
        :param csvKBFolder: folder that contains the file

    Both figures are printed between marker lines and returned as the
    tuple (numberOfEntities, numberOfMissingValue). A cell counts as
    missing only when it compares equal to the empty string ''.
    """
    kbFrame = readDataFile(csvKB, csvKBFolder)

    print("### number of entities ")
    numberOfEntities, _ = kbFrame.shape
    print(numberOfEntities)
    print("###")

    print("### missing values ")
    emptyCells = kbFrame == ''
    perRowCount = emptyCells.sum(axis=1)
    numberOfMissingValue = perRowCount.sum(axis=0)
    print(numberOfMissingValue)
    print("###")

    return numberOfEntities, numberOfMissingValue
def createStopListFromFile(fileName, columnName=None, by="column",
                           folder="Texts"):
    """
    Create a set of stop words from one or several columns of a data
    file (.csv).

    Parameters:
        :param fileName: is the csv file name
        :param columnName: is the column name, or list of column names, of
            the csv file to be used to get the stop words
        :param by: kept for backward compatibility; it is not used by this
            function
        :param folder: folder where to get the file (fileName)

    Returns the set of all values found in the selected column(s).
    """
    df = readDataFile(fileName, folder)
    # Selecting several columns yields a 2-D array whose tolist() is a list
    # of lists, which set() cannot hash; ravel() flattens it and is a no-op
    # for a single column.
    words = df[columnName].values.ravel().tolist()
    return set(words)
def checkIfEntityInDataset(entityURI, entityAttributeName, datasetFile,
                           datasetFileFolder="Outputs"):
    """
    Tell whether an entity appears in a given column of a dataset file.

    Parameters:
        :param entityURI: the value to look for
        :param entityAttributeName: the column of the dataset to search
        :param datasetFile: name of the file, loaded through readDataFile
        :param datasetFileFolder: folder that contains the file

    Both the searched value and the column values are compared through
    their str() form. Returns True on a match, False otherwise (including
    when the column is empty).
    """
    datasetFrame = readDataFile(datasetFile, datasetFileFolder)
    wanted = str(entityURI)
    return any(str(value) == wanted
               for value in datasetFrame[entityAttributeName])
def _collectWordVectors(wordVectors, text, logEveryWord=False):
    """Map every word of text known to the embedding to its vector."""
    words, _ = createVocabulary(stoplist, text)
    found = {}
    for word in words:
        known = word in wordVectors
        if known or logEveryWord:
            print("# ", word)
        if known:
            found[word] = wordVectors[word]
    return found


def getAttributeVector(myModel, dataBaseFile, entity, entityProperty=None,
                       dataBaseFolder="Texts"):
    """
    Get the embedding vectors of the words describing an entity.

    The database file is loaded with readDataFile and every cell is
    converted to str. The row whose "entity" column equals str(entity) is
    selected, the text of the requested property/properties is turned into
    a vocabulary with createVocabulary (using the module-level stoplist),
    and each vocabulary word present in the embedding model is mapped to
    its vector.

    Parameters:
        :param myModel: file name of the embedding model from the corpus;
            it is loaded with Word2Vec.load from the MODEL folder
        :param dataBaseFile: the database file name (csv format)
        :param entity: the URI of the entity we are interested in
        :param entityProperty: a given property (str) or list of properties
            of the entity
        :param dataBaseFolder: is the name of the folder containing the
            database file used by this function

    Returns a tuple (vectorSize, result), never None:
        - entityProperty is a str naming an existing column:
          result is {property: {word: vector}}, or {} when the cell is
          empty or could not be processed;
        - entityProperty is a list: result is a list with one
          {property: {word: vector}} dictionary per property that is an
          existing column with a non-empty cell;
        - the entity is not in the file, or entityProperty is anything
          else: result is empty ([] for a list request, {} otherwise).
    """
    df = readDataFile(dataBaseFile, dataBaseFolder)
    listOfColumns = list(df.columns)
    model = Word2Vec.load(os.path.join(MODEL, myModel))
    vectorSize = model.vector_size
    # KeyedVectors supports `in` and indexing directly: constant-time
    # membership instead of scanning a list of the whole vocabulary, and no
    # reliance on the deprecated `model[word]` shortcut.
    wordVectors = model.wv
    print("Shape of data frame: ", df.shape)

    wantsList = isinstance(entityProperty, list)

    # search for the entity in the dataBase file
    df = df.applymap(str)
    rowInfo = df.loc[df["entity"] == str(entity)]
    if rowInfo.empty:
        # Unknown entity: do not fall back to the first row of the file.
        return vectorSize, ([] if wantsList else {})
    # Take the row positionally so a non-default index cannot select a
    # different row.
    listRow = rowInfo.iloc[0]

    if isinstance(entityProperty, str) and entityProperty in listOfColumns:
        colIndex = listOfColumns.index(entityProperty)
        print("Property index", colIndex)
        attributeVector = {}
        try:
            text = listRow.iloc[colIndex]
            if text:
                attributeVector[entityProperty] = _collectWordVectors(
                    wordVectors, text)
                print("Attribute:", attributeVector)
        except Exception as err:
            # Best effort, as before, but the caller still receives a
            # tuple it can unpack.
            print("PROPERTY : ", entityProperty, "NOT IN DATABASE")
            print("Reason: ", err)
            attributeVector = {}
        return vectorSize, attributeVector

    if wantsList:
        listOfAttributesVectors = []
        for propertyInList in entityProperty:
            if propertyInList not in listOfColumns:
                continue
            colIndex = listOfColumns.index(propertyInList)
            print("Property index ", colIndex)
            try:
                text = listRow.iloc[colIndex]
                if text:
                    listOfAttributesVectors.append({
                        propertyInList: _collectWordVectors(
                            wordVectors, text, logEveryWord=True)
                    })
            except Exception as err:
                # Skip the property that failed and keep the others.
                print("PROPERTY : ", propertyInList, "NOT IN DATABASE")
                print("Reason: ", err)
        print(listOfAttributesVectors)
        return vectorSize, listOfAttributesVectors

    return vectorSize, {}
def _appendRow(path, fields):
    """Append one tab-separated line to the file at path."""
    with open(path, "a+") as handle:
        handle.write("\t".join(fields))
        handle.write("\n")


def completeSimilarityOfDatasets(corpusEmbeddedModel,
                                 model,
                                 dataBaseFileOne,
                                 frequencyModelFileOne,
                                 dataBaseFileTwo,
                                 frequencyModelFileTwo,
                                 properties=None,
                                 modelFolder="Models",
                                 dataBaseFolder="Texts",
                                 frequencyFolder="Outputs"):
    """
    This function takes two datasets (csv format) and writes a file
    containing the cross similarity of all their entities.

    Every entity of the first file is compared with every entity of the
    second one through computeSimilarity; pairs in which either entity has
    no usable vector (None or all zeros) are skipped. One tab-separated
    line "entityOne, entityTwo, euclidean, cosine" is appended to the
    output file (created in the OUTPUT folder) per compared pair.

    Parameters:
        :param corpusEmbeddedModel: file name of the trained model from
            the corpus. Items 1, 4 and 6 of the name split on "_" are
            used to build the output file name.
        :param model: is the model being used (tf/idf/tfidf).
        :param dataBaseFileOne: is the first database CSV file.
        :param dataBaseFileTwo: is the second database CSV file.
        :param frequencyModelFileOne/frequencyModelFileTwo: frequency model
            of the first database and the second database respectively.
        :param properties: property name, list of property names, or None
            to use LISTOFPROPERTIES.
        :param modelFolder: kept for backward compatibility; it is not
            used by this function.
        :param dataBaseFolder: folder containing both database files.
        :param frequencyFolder: folder containing both frequency files.

    Returns the tuple ("Outputs", outputCombineFile).
    """
    cEModel = corpusEmbeddedModel.split("_")
    dfOne = readDataFile(dataBaseFileOne, dataBaseFolder)
    dfTwo = readDataFile(dataBaseFileTwo, dataBaseFolder)

    if properties is None:
        listOfAttributs = LISTOFPROPERTIES
    elif isinstance(properties, str):
        listOfAttributs = [properties]
    else:
        listOfAttributs = properties

    timeStamp = str(datetime.now()).replace(":", "").replace(
        "-", "").replace(" ", "").split(".")[0]
    outputCombineFile = ("distancesCrossSimilarity_" + "CorpusModel_" +
                         cEModel[1] + "_win_" + cEModel[4] + "_vec_" +
                         cEModel[6] + "_attribute_" +
                         "-".join(listOfAttributs) + "_weight_" + model +
                         "_" + timeStamp + ".csv")
    outputPath = os.path.join(OUTPUT, outputCombineFile)
    _appendRow(outputPath, [
        dataBaseFileOne.split(".csv")[0],
        dataBaseFileTwo.split(".csv")[0], "euclidean", "cosine"
    ])

    def buildVector(dataBaseFile, frequencyModelFile, entity):
        """Return the usable vector of entity, or None when there is none."""
        found = getAttributeVector(corpusEmbeddedModel, dataBaseFile, entity,
                                   listOfAttributs, dataBaseFolder)
        if found is None:
            return None
        vectorSize, attributeVector = found
        entityVector = usableAttributeVector(frequencyModelFile, model,
                                             entity, attributeVector,
                                             vectorSize, frequencyFolder)
        # consider only non zero vectors
        if entityVector is None or np.array_equal(
                entityVector, np.zeros(vectorSize, dtype="float64")):
            return None
        return entityVector

    # A vector depends only on its own entity, so each one is built once
    # instead of reloading the model and the CSV files for every pair.
    vectorsTwo = []
    for _, listRowTwo in dfTwo.iterrows():
        entityTwo = str(listRowTwo["entity"])
        vectorsTwo.append((entityTwo,
                           buildVector(dataBaseFileTwo,
                                       frequencyModelFileTwo, entityTwo)))

    for _, listRowOne in dfOne.iterrows():
        print("### listRowOne")
        print(listRowOne["description"])
        print("###")
        entityOne = str(listRowOne["entity"])
        entityVectorOne = buildVector(dataBaseFileOne, frequencyModelFileOne,
                                      entityOne)
        if entityVectorOne is None:
            continue
        for entityTwo, entityVectorTwo in vectorsTwo:
            print("### listRowTwo")
            print(entityTwo)
            print("###")
            if entityVectorTwo is None:
                continue
            euclideanDistance, cosineDistance = computeSimilarity(
                entityVectorOne, entityVectorTwo)
            print(entityOne, " - ", entityTwo, " == ", euclideanDistance,
                  cosineDistance)
            # Reopened for each row so finished rows are on disk even if a
            # long run is interrupted.
            _appendRow(outputPath, [
                entityOne, entityTwo,
                str(euclideanDistance),
                str(cosineDistance)
            ])
    return "Outputs", outputCombineFile
def _sumAttributeVectors(attributeVector, coefficients, listOfWords,
                         vectorSize):
    """Sum the word vectors of a dict (or list of dicts), each scaled by its coefficient."""
    total = np.zeros(vectorSize, dtype="float64")
    if isinstance(attributeVector, list):
        for item in attributeVector:
            total += _sumAttributeVectors(item, coefficients, listOfWords,
                                          vectorSize)
    elif isinstance(attributeVector, dict):
        for word in listOfWords:
            for attribute, wordVectors in attributeVector.items():
                if word not in wordVectors:
                    continue
                coef = coefficients[word]
                print("attribute", attribute)
                print("word", word)
                print("coefficient value", coef)
                total += np.array(wordVectors[word], dtype="float64") * coef
    return total


def usableAttributeVector(frequencyModelFile,
                          model,
                          entity,
                          attributeVector,
                          vectorSize,
                          frequencyModelFolder="Outputs"):
    """
    This function returns a usable vector of an entity from a given
    frequency file.

    For every row of the frequency file that belongs to the entity, the
    vector of the row's word (when an attribute dictionary contains it) is
    multiplied by the row's coefficient and added to the result.

    Parameters:
        :param frequencyModelFile: is the csv file containing words and
            their frequencies (tf/tfidf)
        :param model: is the model being used; one of "TF-IDF", "tf-idf",
            "TFIDF", "tfidf", "TF", "tf"
        :param entity: is the URI of the entity whose vector is wanted
        :param attributeVector: a dictionary {attribute: {word: vector}} or
            a list of such dictionaries, as returned by getAttributeVector
        :param vectorSize: is the size of word vectors from the embedding
            model
        :param frequencyModelFolder: folder containing the frequency file

    Returns a float64 vector of length vectorSize, or None when
    attributeVector is empty or is neither a dict nor a list.

    Raises ValueError when model is not one of the supported values (the
    idf variant was never functional: it ended in a NameError).
    """
    print("### attributeVector")
    print(attributeVector)
    print("###")
    if not attributeVector or not isinstance(attributeVector, (dict, list)):
        return None
    if model not in ("TF-IDF", "tf-idf", "TFIDF", "tfidf", "TF", "tf"):
        raise ValueError("unsupported weight model: " + str(model))

    # Read once; list input previously re-read the file per attribute and
    # dropped frequencyModelFolder in the recursive call.
    frequencyDataFrame = readDataFile(frequencyModelFile,
                                      frequencyModelFolder)
    if "entity" in frequencyDataFrame.columns:
        # The previous chained assignment overwrote the whole "entity"
        # column with the requested entity, so every row matched. Compare
        # on the str() form because callers pass the entity as a string.
        entityMask = frequencyDataFrame["entity"].astype(str) == str(entity)
        entityDataFrame = frequencyDataFrame.loc[entityMask, :]
    else:
        # Files without an "entity" column keep their former behaviour:
        # every row is used.
        entityDataFrame = frequencyDataFrame
    print("### entity frame")
    print(entityDataFrame)
    print("###")
    listOfWords = entityDataFrame.loc[:, "word"].values
    print("### list of words")
    print(listOfWords)
    print("###")

    # NOTE(review): the coefficient is read from the third column
    # (position 2), as before -- confirm against the file layout.
    coefficients = {}
    if not entityDataFrame.empty:
        for word, coef in zip(listOfWords, entityDataFrame.values[:, 2]):
            # keep the first coefficient when a word is listed twice
            coefficients.setdefault(word, coef)

    return _sumAttributeVectors(attributeVector, coefficients, listOfWords,
                                vectorSize)
def evaluation(groundFile,
               groundColumnName,
               resultFile,
               resultColumnName,
               threshold=None,
               distance=None,
               groundFileFolder="Outputs",
               resultFileFolder="Outputs",
               plot=False):
    """
    This function takes the ground file and the result file, and computes
    the share of ground-truth couples recovered in the result file.

    A text report "evaluation<timestamp>.txt" is written to the OUTPUT
    folder. Its header is derived from the result file name split on "_"
    (items 2, 4, 6, 8 and 10); items missing from the name are left blank.

    Parameters:
        :param groundFile: is the ground truth file name containing the
            matches of entities from both knowledge based files
        :param groundColumnName: the column names (02) corresponding to
            the matches.
        :param resultFile: is the result file from cross calculations of
            distances
        :param resultColumnName: the columns of interest from the result
            file; only its first item is used, to select rows
        :param threshold: the value that the distances should satisfy: a
            number, or a list of numbers when plot is set.
            default -> None
        :param distance: is the type of distance being used; the value is
            read at column position distance + 1 of the result file.
            default -> None
                1 -> euclidean
                2 -> cosine
        :param groundFileFolder: folder containing the ground file
        :param resultFileFolder: folder containing the result file
        :param plot: states if the threshold-precision graph should be
            plotted
                True -> plot graph
                False -> do not plot graph

    Returns (precision, recall) when threshold is a number, with
    recall = matches / ground rows and precision = matches / result rows
    (0.0 when the corresponding file has no rows). Returns None otherwise;
    for a list of thresholds with plot set, a PNG plot is saved in OUTPUT.
    """
    groundFrame = readDataFile(groundFile, groundFileFolder)
    groundRows = groundFrame.shape[0]
    resultFrame = readDataFile(resultFile, resultFileFolder)
    resultRows = resultFrame.shape[0]
    extractedGround = groundFrame[groundColumnName]
    distanceInfo = resultFile.split("_")

    def infoAt(position):
        """Return item `position` of the split file name, or "" if absent."""
        if position < len(distanceInfo):
            return distanceInfo[position]
        return ""

    outputevaluationFile = "evaluation" + str(datetime.now()).replace(
        ":", "").replace("-", "").replace(" ", "").split(".")[0] + ".txt"
    evaluationPath = os.path.join(OUTPUT, outputevaluationFile)
    headerLines = [
        "Ground file ", groundFile,
        "Result file ", resultFile,
        "Corpus Model ", infoAt(2),
        "Corpus Model window size ", infoAt(4),
        "Corpus Model vector dimension ", infoAt(6),
        "Corpus Model attribute ", " ".join(infoAt(8).split("-")),
        "Weight coef ", infoAt(10),
    ]
    with open(evaluationPath, "a+") as f:
        f.write("\n".join(headerLines))
        f.write("\n")

    if isinstance(threshold, (int, float)):
        countMatch = 0
        for _, row in extractedGround.iterrows():
            couple = [row[groundColumnName[0]], row[groundColumnName[1]]]
            print("### groud couple")
            print(couple)
            print("###")
            matchFrame = resultFrame[resultFrame[resultColumnName[0]] ==
                                     couple[0]]
            matchValues = matchFrame.values
            # NOTE(review): only the FIRST result row of the entity is
            # inspected, so a couple counts only when its partner happens
            # to be listed first -- confirm this is intended.
            if (not matchFrame.empty and matchValues[0][1]
                    and matchValues[0][1] == couple[1]
                    and matchValues[0][distance + 1] >= threshold):
                countMatch += 1
                print("### countMatch")
                print(countMatch)
                print("###")
                print("### matchFrame")
                print(matchFrame.values)
                print("###")
        # An empty file yields 0.0 instead of a ZeroDivisionError.
        recall = countMatch / groundRows if groundRows else 0.0
        precision = countMatch / resultRows if resultRows else 0.0
        with open(evaluationPath, "a+") as f:
            f.write("Recall: \n")
            f.write(str(recall))
            f.write("\n")
            f.write("Precision: \n")
            f.write(str(precision))
            f.write("\n")
        return precision, recall

    if isinstance(threshold, list) and plot:
        listOfPrecision = []
        print("### list of threshold")
        print(threshold)
        print("###")
        for th in threshold:
            print("### th in threshold")
            print(th)
            print("###")
            print()
            prec, rec = evaluation(groundFile, groundColumnName, resultFile,
                                   resultColumnName, th, distance,
                                   groundFileFolder, resultFileFolder, False)
            listOfPrecision.append(prec)
        print("### listOfPrecision")
        print(listOfPrecision)
        print("###")
        if not threshold:
            # nothing to plot, and max() would fail on an empty list
            return None
        fig = plt.figure()
        plt.plot(threshold, listOfPrecision, 'ro')
        plt.axis([0, max(threshold), 0, 1])
        fig.savefig(
            os.path.join(
                OUTPUT, "evaluation" + "_plot_" + str(datetime.now()).replace(
                    ":", "").replace("-", "").replace(" ", "").split(".")[0] +
                ".png"))
        # release the figure so repeated calls do not accumulate figures
        plt.close(fig)
    return None