Example #1
def getEntityFile():
    from px_aux import ORIGINAL_TEXTS_FOLDER as _ORIGINAL_TEXTS_FOLDER

    fich = request.values.get('file')
    sfile = _ORIGINAL_TEXTS_FOLDER + "/" + fich
    pfile = sfile + ".p"

    content_file = _Open(sfile, "rt", encoding='utf8')
    content = content_file.read()
    content_file.close()

    fileDataMod = pickle.load(_Open(pfile, "rb"))

    result = {'text': content, 'data': fileDataMod}
    return jsonify(result)
Example #2
def getFile():
    from px_aux import ORIGINAL_TEXTS_FOLDER as _ORIGINAL_TEXTS_FOLDER

    fich = request.values.get('file')
    content_file = _Open(_ORIGINAL_TEXTS_FOLDER + "/" + fich,
                         "rt",
                         encoding='utf8')
    content = content_file.read()
    content_file.close()

    if fich.endswith(".txt"):
        head = fich + ": fichero original"
    elif fich.endswith(".s.html"):
        head = fich + ": fichero donde se han añadido sufijos romanos a partir del contexto (<span style='color: green'>marcados en verde</span>)"
    elif fich.endswith(".s.nr.html"):
        head = fich + ": informe sobre las surface forms (<span style='color: red'>marcadas en rojo</span>) a las que se estudió añadir un sufijo romano y finalmente se descartó"
    elif fich.endswith(".s"):
        head = fich + ": fichero de partida, donde ya se han añadido sufijos romanos a partir del contexto"
    elif fich.endswith(".p.html"):
        head = fich + ": fichero de partida, donde se han marcado las entidades detectadas"
    elif fich.endswith(".w.html"):
        head = fich + ": fichero donde se han realizado las transformaciones de la surface form (<span style='color: blue'>en azul tachado</span>) a la última parte del URL de la entidad que le corresponde (<span style='color: green'>en verde</span>)"
    else:
        head = fich + ": fichero con propósito desconocido"

    if fich.endswith(".txt") or fich.endswith(".s"):
        content = content.replace("\n", "<p>")

    result = {'head': head, 'text': content}
    return jsonify(result)
Example #3
	def euclideanTextSimilarity (self, candidate_text=None, candidate_file=None):
		try:
			if not candidate_text:
				candidate_fileFD = _Open(candidate_file, "r")
				candidate_text = candidate_fileFD.read()

			list_of_text = [self.original_text, candidate_text]   # Create a list of documents of the original text and the new candidate text

			vectorizer = CountVectorizer()   # Create a CountVectorizer Object

			# Transform arbitrary data into numerical features
			# Description: tokenize the text, create a vocabulary from the distinct words, and map each document to its token counts (no stopwords are removed with the default CountVectorizer settings)
			features = vectorizer.fit_transform(list_of_text).todense()

			# Measure the euclidean distance, returns an array with the euclidean distance
			euclideanDistances = euclidean_distances(features[0], features[1])

			euclidean_distance = euclideanDistances[0][0]   # between 0 and N, 0 is the best

			euclidean_similarity = 1 / (1 + euclidean_distance) # between 0 and 1, 1 is the best
		except Exception as e:
			print("** ERROR euclideanTextSimilarity:", str(e))
			raise e

		return euclidean_similarity
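For reference, a minimal self-contained sketch (not part of the project) of the distance-to-similarity mapping 1 / (1 + d) used above: identical texts give a Euclidean distance of 0 and therefore a similarity of 1, and the similarity decays towards 0 as the distance grows.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import euclidean_distances

texts = ["the cat sat on the mat", "the cat sat on the mat"]
features = CountVectorizer().fit_transform(texts).toarray()   # token-count vectors
distance = euclidean_distances([features[0]], [features[1]])[0][0]
print(distance, 1 / (1 + distance))   # prints: 0.0 1.0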
Example #4
	def __iter__(self):
		print("MyCorpusBoth iteration", numIter)

		startTime = datetime.now()
		print("Generic files")

		for i,file in enumerate(listGeneric):
			if (i % 100) == 0:
				print(i, end=' ', flush=True)

			with codecs.open(file, 'r', encoding='utf-8', errors='ignore') as fdata:
				text = fdata.read()
				# remove stopwords
				cText = Gensim_remove_stopwords(text)
				# preprocess the text (tokenize, lower, remove punctuation, remove <2 and >50 length words)
				wordsText = simple_preprocess(cText, max_len=50)
				# feed the caller
				yield TaggedDocument(words=wordsText, tags=dictTags[file])

		print("\nAd hoc files")

		for file in listCorpusAH:
			fd = _Open(file, "r")
			text = fd.read()
			# remove stopwords
			cText = Gensim_remove_stopwords(text)
			# preprocess the text (tokenize, lower, remove punctuation, remove <2 and >50 length words)
			wordsText = simple_preprocess(cText, max_len=50)
			# feed the caller
			yield TaggedDocument(words=wordsText, tags=dictTags[file])

		endTime = datetime.now()
		elapsedTime = endTime - startTime
		print(" Duración:", elapsedTime.seconds)
Example #5
def setStoredQA():
    if request.method == "POST":
        print(request.values)
        QAsString = request.values.get("value")
        listQATuplas = re.findall(r"(.*)\|(.*)", QAsString)
        pickle.dump(listQATuplas, _Open("storedQA.p", "wb"))
        return jsonify(listQATuplas)
Example #6
	def sharedSubjectsJaccardSimilarity (self, fileNameCandidateSubjects):

		try:  # try to read candidate text subjects from local DB
			with _Open(fileNameCandidateSubjects) as fp:
				candidate_text_subjects = fp.read().splitlines()
		except Exception as e:
			_Print("Candidate subjects file not found in local DB:", fileNameCandidateSubjects)
			_appendFile(self.logFilename, "ERROR sharedSubjectsJaccardSimilarity(): Candidate subjects file not found: "+fileNameCandidateSubjects+" "+str(e))
			return -1

		if len(candidate_text_subjects) == 0:
			return 0

		# the subjects lists for both texts are now available
		subjects_jaccard_similarity = 0

		try:
			# change every candidate subject by the pair (subject, list of subject components)
			pairs_candidate_text_subjects = list(map(lambda x: (x, _getSubjectComponents(x)), candidate_text_subjects))

			numContributions=0  # number of matches - contributions with some similarity
			sum_sims = 0  # to aggregate similarities contributions

			for (sbo,sbocl) in self.pairs_original_text_subjects:
				for (sbc,sbccl) in pairs_candidate_text_subjects:
					min_long = min(len(sbocl), len(sbccl)) # length of the shorter subject

					if (min_long < 3):  # both subjects must have at least 3 components
						continue

					intersection_cardinality = len(set.intersection(set(sbocl), set(sbccl)))

					# for the shorter subject, we require at most 1 component not to be included in the larger subject
					if (intersection_cardinality < (min_long - 1)):
						continue

					# this fulfills the requirements: it is a contribution

					numContributions += 1
					union_cardinality = len(set.union(set(sbocl), set(sbccl)))
					component_jaccard_similarity = intersection_cardinality/float(union_cardinality)
					sum_sims += component_jaccard_similarity
					_Print(numContributions, "->", sbo, ",", sbc, component_jaccard_similarity)

			if numContributions == 0:  # no contributions at all
				return 0

			subjects_jaccard_similarity = sum_sims / numContributions
		except Exception as e:
			_Print("ERROR sharedSubjectsJaccardSimilarity(): Exception while computing Jaccard subjects similarity: "+str(e))
			_appendFile(self.logFilename, "ERROR sharedSubjectsJaccardSimilarity(): Exception while computing Jaccard subjects similarity: "+str(e))
			return -1

		if subjects_jaccard_similarity > 1:
			_Print("Candidate with subjects similarity > 1:", fileNameCandidateSubjects, sum_sims, denominator, subjects_jaccard_similarity)
			_appendFile(self.logFilename, "ERROR sharedSubjectsJaccardSimilarity(): similarity > 1")
			return -1

		return subjects_jaccard_similarity
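For reference, this is the plain Jaccard similarity between two component lists, which is the per-pair quantity aggregated above; the subject components below are invented for illustration.

sbocl = ["World", "War", "II", "Battles"]    # components of an original-text subject
sbccl = ["World", "War", "II", "Treaties"]   # components of a candidate-text subject

intersection_cardinality = len(set(sbocl) & set(sbccl))   # 3 shared components
union_cardinality = len(set(sbocl) | set(sbccl))          # 5 distinct components overall
print(intersection_cardinality / union_cardinality)       # 0.6; this pair would count as a contribution above (3 >= 4 - 1)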
Example #7
	def spacyTextSimilarity_calc (self, candidate_text=None, candidate_file=None):
		if not candidate_text:
			candidate_fileFD = _Open(candidate_file, "r")
			candidate_text = candidate_fileFD.read()

		candidate_text_doc_tokens = self.nlp(candidate_text)

		return self.compute_similarity_without_stopwords_punct(self.original_text_doc_tokens, candidate_text_doc_tokens)
Example #8
    def htmlFolderToText(self, folderPath):
        list_of_files = glob.glob(folderPath + "*.html")

        for html_file in list_of_files:
            file = _Open(html_file, "r")
            html = file.read()

            pageName, cleanedText = htmlToText(html)
Example #9
    def sharedSubjectsSimilarity(self, original_text_subjects,
                                 fileNameCandidateSubjects, logFilename):

        try:  # try to read candidate text subjects from local store
            with _Open(fileNameCandidateSubjects) as fp:
                candidate_text_subjects = fp.read().splitlines()
                print("File already available in local DB:",
                      fileNameCandidateSubjects)
        except:  # fetch candidate text subjects if not in local store
            _appendFile(
                logFilename,
                "ERROR sharedSubjectsSimilarity(): Subjects file not available: "
                + fileNameCandidateSubjects)
            return -1

        if len(candidate_text_subjects) == 0:
            _appendFile(
                logFilename,
                "ERROR sharedSubjectsSimilarity(): Subjects file empty: " +
                fileNameCandidateSubjects)
            return -1

        # the subjects lists for both texts are now available

        try:
            # replace every original subject with the pair (subject, list of subject components)    NOTE: wasteful to recompute this on every call
            pairs_original_text_subjects = list(
                map(lambda x: (x, _getSubjectComponents(x)),
                    original_text_subjects))

            # change every candidate subject by the pair (subject, list of subject components)
            pairs_candidate_text_subjects = list(
                map(lambda x: (x, _getSubjectComponents(x)),
                    candidate_text_subjects))

            sum_sims = 0
            for (wko, wkocl) in pairs_original_text_subjects:
                for (wkc, wkccl) in pairs_candidate_text_subjects:
                    wkc_jaccard_similarity = self.measures.oJaccardSimilarity(
                        wkocl, wkccl)
                    sum_sims += wkc_jaccard_similarity

            union_cardinality = len(
                set.union(set(original_text_subjects),
                          set(candidate_text_subjects)))

            if union_cardinality == 0:  # should not happen: this point is not reached if len(original_text_subjects) == 0
                return -1
            else:
                subjects_jaccard_similarity = sum_sims / union_cardinality
        except Exception as e:
            _appendFile(
                logFilename,
                "ERROR sharedSubjectsSimilarity(): Exception while computing Jaccard subjects similarity: "
                + str(e))
            return -1

        return subjects_jaccard_similarity
Example #10
	def spacyTextSimilarity (self, candidate_text=None, candidate_file=None):
		if not candidate_text:
			candidate_fileFD = _Open(candidate_file, "r")
			candidate_text = candidate_fileFD.read()

		# Tokenize candidate text based on the spacy package
		candidate_text_doc_tokens_without_stopwords = self.nlp(self.remove_spacy_stopwords(candidate_text))

		# Measure both texts similarity with spaCy method and return it
		return self.original_text_doc_tokens_without_stopwords.similarity(candidate_text_doc_tokens_without_stopwords)
Example #11
File: px_aux.py  Project: agilll/Plethora
def getContentMarked(filename, type):

    file = _Open(filename, 'r')
    content = file.read()

    pfilename = filename + ".p"

    if not os.path.isfile(pfilename):
        print("Does not exist " + pfilename)
        return content

    pfile = _Open(pfilename, 'rb')
    dics = pickle.load(pfile)

    dicOffsets = dics["byOffset"]

    finalHTMLContent = ""
    currentPosition = 0

    # iteration follows the insertion order of the dictionary, which is assumed to be increasing offset order
    for k in dicOffsets:
        entity = dicOffsets[k]
        text = content[currentPosition:int(k)]
        currentPosition += len(text)

        finalHTMLContent += text.replace("\n", "\n<br>")

        urlEntity = entity["@URI"]

        if type == "s":
            name = entity["@surfaceForm"]
        else:
            name = entity["entityName"]

        finalHTMLContent += "<a href='" + urlEntity + "?lang=en'>" + name + "</a>"
        currentPosition += len(name)

    return finalHTMLContent
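For clarity, a hypothetical illustration (not taken from the project) of the pickled structure that getContentMarked() expects: the key names match the accesses above, while the offset and values are invented.

dics = {
    "byOffset": {
        "17": {                                               # character offset of the entity in the text
            "@URI": "http://dbpedia.org/resource/Example",    # link target
            "@surfaceForm": "Example",                        # used when type == "s"
            "entityName": "Example"                           # used otherwise
        }
    }
}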
Example #12
	def __iter__(self):
		global numIter
		print("MyCorpusAH iteration", numIter)
		numIter = numIter + 1

		for file in listCorpusAH:

			fd = _Open(file, "r")
			text = fd.read()
			# remove stopwords
			cText = Gensim_remove_stopwords(text)
			# preprocess the text (tokenize, lower, remove punctuation, remove <2 and >50 length words)
			wordsText = simple_preprocess(cText, max_len=50)
			# feed the caller
			yield TaggedDocument(words=wordsText, tags=dictTags[file])
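A minimal sketch of how a stream of TaggedDocument objects like the one produced by this iterator is typically fed to gensim's Doc2Vec; the two-document corpus and the hyperparameters below are illustrative, not the project's settings.

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess

docs = [TaggedDocument(words=simple_preprocess(text, max_len=50), tags=[str(i)])
        for i, text in enumerate(["first ad hoc document", "second ad hoc document"])]

model = Doc2Vec(vector_size=50, min_count=1, epochs=20)
model.build_vocab(docs)   # first pass over the corpus: build the vocabulary
model.train(docs, total_examples=model.corpus_count, epochs=model.epochs)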
Example #13
def getTrainingTexts():
    from pp_app import historicalDBpediaDataMod as _historicalDBpediaDataMod
    from px_aux import TEXTS_FOLDER as _TEXTS_FOLDER, DEFAULT_TRAINING_TEXTS as _DEFAULT_TRAINING_TEXTS

    result = {}
    for f in os.listdir(_TEXTS_FOLDER):
        #if f.endswith("w"):
        if f == _DEFAULT_TRAINING_TEXTS:
            with _Open(_TEXTS_FOLDER + "/" + f, "rt",
                       encoding='utf8') as content_file:
                content = content_file.read()
            result[f] = {'text': content, 'data': _historicalDBpediaDataMod}

    return jsonify(result)
Example #14
	def fullSubjectsJaccardSimilarity (self, fileNameCandidateSubjects):

		try:  # try to read candidate text subjects from local DB
			with _Open(fileNameCandidateSubjects) as fp:
				candidate_text_subjects = fp.read().splitlines()
		except Exception as e:
			_Print("Candidate subjects file not found in local DB:", fileNameCandidateSubjects)
			_appendFile(self.logFilename, "ERROR fullSubjectsJaccardSimilarity(): Candidate subjects file not found: "+fileNameCandidateSubjects+" "+str(e))
			return -1

		if len(self.original_text_subjects) == 0 or len(candidate_text_subjects) == 0:
			return 0

		subjects_jaccard_similarity = self.oMeasures.oJaccardSimilarity(self.original_text_subjects, candidate_text_subjects)

		return subjects_jaccard_similarity
Example #15
	def doc2VecTextSimilarity (self, candidate_text=None, candidate_file=None):

		if not candidate_text:
			candidate_fileFD = _Open(candidate_file, "r")
			candidate_text = candidate_fileFD.read()

		if (self.remove_stopwords == True):
			candidate_text = remove_stopwords(candidate_text)

		# Use gensim.utils.simple_preprocess for processing:
		# tokenize text to individual words, remove punctuations, set to lowercase, and remove words less than 2 chars or more than 50 chars
		candidate_text_tokens  = simple_preprocess(candidate_text, max_len=50)

		# infer_vector(): Generates a vector from a document
		# The document should be tokenized in the same way the model's training documents were tokenized
		# The function may accept some optional parameters (alpha, min_alpha, epochs, steps)

		# infer_vector(doc_words, alpha=None, min_alpha=None, epochs=None, steps=None)
		# doc_words (list of str) – A document for which the vector representation will be inferred.
		# alpha (float, optional) – The initial learning rate. If unspecified, value from model initialization will be reused.
		# min_alpha (float, optional) – Learning rate will linearly drop to min_alpha over all inference epochs. If unspecified, value from model initialization will be reused.
		# epochs (int, optional) – Number of times to train the new document. Larger values take more time, but may improve quality and run-to-run stability of inferred vectors. If unspecified, the epochs value from model initialization will be reused.
		# steps (int, optional, deprecated) – Previous name for epochs, still available for now for backward compatibility: if epochs is unspecified but steps is, the steps value will be used.

		# Generate a vector from the tokenized candidate text
		candidate_text_inferred_vector = self.model.infer_vector(candidate_text_tokens, epochs=50)

		# The sklearn math functions return an array with the results
		# We shall keep only one of them, either sklearn or ourSimilarityListsFunctions

		# Measure vectors similarity using cosine similarity
		# cos_similarity = cosine_similarity([original_text_inferred_vector], [text_inferred_vector])

		# Measure vectors similarity using euclidean distance
		# euc_distance = euclidean_distances([original_text_inferred_vector], [text_inferred_vector])

		# Measure vectors similarity using cosine similarity
		cos_similarity = self.ourMeasures.oCosineSimilarity(self.original_text_inferred_vector, candidate_text_inferred_vector)

		# Measure vectors similarity using euclidean distance
		# euc_distance = self.ourMeasures.oEuclideanDistance(self.original_text_inferred_vector, candidate_text_inferred_vector)

		# Measure vectors similarity using manhattan distance
		# man_distance = self.ourMeasures.oManhattanDistance(self.original_text_inferred_vector, candidate_text_inferred_vector)

		return cos_similarity
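A minimal sketch of infer_vector() followed by cosine similarity, in line with the comments above; the model path is a placeholder for an already trained Doc2Vec model.

from gensim.models.doc2vec import Doc2Vec
from gensim.utils import simple_preprocess
from sklearn.metrics.pairwise import cosine_similarity

model = Doc2Vec.load("trained.model")   # hypothetical path to a trained model

original_vector = model.infer_vector(simple_preprocess("the original text", max_len=50), epochs=50)
candidate_vector = model.infer_vector(simple_preprocess("a candidate text", max_len=50), epochs=50)

print(cosine_similarity([original_vector], [candidate_vector])[0][0])   # in [-1, 1], higher means more similar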
Example #16
ipctg = int(pctg)

MODEL_NAME = "/Users/agil/CloudStation/KORPUS/hibrido_"+TRCORPUS+"_"+pctg+".model"
AH_CORPUS_FOLDER = "/Users/agil/CloudStation/KORPUS/SCRAPPED_PAGES/"

GEN_FOLDER_BASE = "/Users/agil/Downloads/_corpus/"
GEN_FOLDER = GEN_FOLDER_BASE+TRCORPUS+"/"

numIter=0

# read the first 8% from 1926.ph5-3.simsBest.csv: the list of ad hoc candidates
listBestAPFilename = "/Users/agil/CloudStation/KORPUS/1926/1926.ph5-3.simsBest.csv"
listAP = []

try:
    with _Open(listBestAPFilename, 'r') as csvFile:
        reader = csv.reader(csvFile, delimiter=' ')
        next(reader)  # to skip header
        for row in reader:
            listAP.append(AH_CORPUS_FOLDER+row[0])
except Exception as e:
    print("Exception reading csvFile:", listBestAPFilename, str(e))

sizeAHCorpus = int(len(listAP) / 100) *  ipctg   # 8% of the total candidates
listCorpusAH = listAP[:sizeAHCorpus]

print("Size corpus AH =", sizeAHCorpus)


Example #17
def getWikicatsFromText():
    if request.method == "POST":
        originalText = request.values.get("text")
        len_text = len(originalText)  # length of the received text

        if not os.path.exists(
                _CORPUS_FOLDER):  # create KORPUS folder if not exists
            os.makedirs(_CORPUS_FOLDER)

        filename = _CORPUS_FOLDER + "/" + str(
            len_text
        ) + ".txt"  # save the received text with length.txt filename
        _saveFile(filename, originalText)

        filename_wk = _CORPUS_FOLDER + "/" + str(
            len_text) + ".wk"  # filename for wikicats (length.wk)
        filename_sb = _CORPUS_FOLDER + "/" + str(
            len_text) + ".sb"  # filename for subjects (length.sb)

        result = {}

        try:  # open wikicats file if exists
            with _Open(filename_wk) as fp:
                listWikicats = fp.read().splitlines()
                result["wikicats"] = listWikicats
        except:  # fetch wikicats if file does not exist yet
            result = _getCategoriesInText(
                originalText
            )  # function getCategoriesInText from px_DB_Manager.py

            if ("error" in result):  # return error if could not fetch wikicats
                return jsonify(result)

            listWikicats = list(
                filter(_filterSimpleWikicats, result["wikicats"])
            )  # remove simple wikicats with function from aux.py
            result[
                "wikicats"] = listWikicats  # update result wikicats to return

            _saveFile(filename_wk, '\n'.join(listWikicats)
                      )  # save file (length.wk) with wikicats, one per line

            listSubjects = list(
                filter(_filterSimpleWikicats, result["subjects"])
            )  # remove simple subjects with function from aux.py
            result[
                "subjects"] = listSubjects  # update result subjects to return

            _saveFile(filename_sb, '\n'.join(listSubjects)
                      )  # save file (length.sb) with subjects, one per line

        for w in listWikicats:  # compute components for every wikicat and add all of them to result
            wlc = _getWikicatComponents(
                w)  # function getWikicatComponets from aux.py
            result[w] = wlc  # one entry per wikicat

        filename_selected = _CORPUS_FOLDER + "/" + str(
            len_text
        ) + ".selected.wk"  # previously selected wikicats file for this text

        try:  # try to open previously selected wikicats file if exists
            with _Open(filename_selected) as fp:
                wkSelectedList = fp.read().splitlines()
        except:
            wkSelectedList = []  # no previously selected wikicats

        result["formerSelectedWikicats"] = wkSelectedList

        return jsonify(result)
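The function above follows a read-or-fetch caching pattern: reuse a local file if it exists, otherwise fetch the data and persist it. A minimal generic sketch of that idiom, with hypothetical names (cached_lines, fetch_remote) that are not part of the project:

def cached_lines(cache_file, fetch_remote):
    try:
        with open(cache_file) as fp:          # reuse the local copy if it exists
            return fp.read().splitlines()
    except FileNotFoundError:
        lines = fetch_remote()                # otherwise fetch the data...
        with open(cache_file, "w") as fp:     # ...and persist it for the next call
            fp.write("\n".join(lines))
        return lines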
Example #18
def saveFile(f, content):
    out = _Open(f, 'w')
    out.write(content)
    out.close()
    return
Example #19
def getUrlsLinked2Wikicats(selectedWikicats, logFilename):
    requestObjects = {}  # dictionary to store request objects

    _session = FuturesSession()  # to manage asynchronous requests

    # first phase, reading files or start requests for DBpedia and Wikidata foreach wikicat

    for wikicat in selectedWikicats:

        # first, read or fetch Wikicat results for DBpedia

        filename_db = _URLs_FOLDER + "/_Wikicat_" + wikicat + "_DB_Urls.txt"
        requestDone = 0  # to control if some request has been done, and if so, set a delay to not overload servers

        try:  # try to read wikicats of original text from local store
            with _Open(filename_db) as fp:
                urls_from_DB = fp.read().splitlines()
                print("File already available:", filename_db)
                requestObjects[wikicat] = {
                    "dburls": urls_from_DB
                }  # store the local available DB URLs for this wikicat
        except:  # fetch data from DB
            fullWikicat = "Wikicat" + wikicat

            # asynchronous query to dbpedia
            # request only URLs that are the primary topic of some DBpedia entity
            queryDB = """
			PREFIX yago: <http://dbpedia.org/class/yago/>
			SELECT ?url ?der ?pt WHERE {
				?url  rdf:type yago:""" + fullWikicat + """ .
				OPTIONAL {?url  prov:wasDerivedFrom ?der}
				OPTIONAL {?url  foaf:isPrimaryTopicOf ?pt}
			}
			"""

            # start the DB query
            try:
                print("Starting DB query for: ", wikicat)
                requestDB = _session.post(
                    _URL_DB,
                    data={"query": queryDB},
                    headers={"accept": "application/json"})
            except Exception as exc:
                print(
                    "*** ERROR getUrlsLinked2Wikicats(): Error starting DB query for",
                    wikicat, ":", exc)
                _appendFile(
                    logFilename,
                    "ERROR getUrlsLinked2Wikicats(): Error starting DB query for "
                    + wikicat + ": " + repr(exc))
                requestDB = None

            requestObjects[wikicat] = {
                "db": requestDB
            }  # store the request DB object for this wikicat
            requestDone = 1

        # now, read or fetch Wikicat results for Wikidata

        filename_wk = _URLs_FOLDER + "/_Wikicat_" + wikicat + "_WK_Urls.txt"

        # use update() on the objects dictionary, as the wikicat key has already been created for DBpedia

        wcs = _getWikicatComponents(wikicat)
        wcs_string = " ".join(wcs)

        try:  # try to read wikicats and subjects of original text from local store
            with _Open(filename_wk) as fp:
                urls_from_WK = fp.read().splitlines()
                print("File already available:", filename_wk)
                requestObjects[wikicat].update({
                    "wkurls": urls_from_WK
                })  # store the local available WK URLs for this wikicat
        except:  # fetch data from WK

            # asynchronous query to Wikidata
            queryWK = """
			PREFIX wikibase: <http://wikiba.se/ontology#>
			PREFIX bd: <http://www.bigdata.com/rdf#>
			PREFIX mwapi: <https://www.mediawiki.org/ontology#API/>
			SELECT * WHERE {
				SERVICE wikibase:mwapi {
					bd:serviceParam wikibase:api 'Search' .
					bd:serviceParam wikibase:endpoint 'en.wikipedia.org' .
					bd:serviceParam mwapi:language "en" .
					bd:serviceParam mwapi:srsearch '""" + wcs_string + """' .
					?title wikibase:apiOutput mwapi:title .
				}
			} 		
			"""
            # start the WK query
            try:
                print("Starting WK query for: ", wcs_string)
                requestWK = _session.post(
                    _URL_WK,
                    data={"query": queryWK},
                    headers={"accept": "application/json"})
            except Exception as exc:
                print(
                    "\n*** ERROR getUrlsLinked2Wikicats(): Error starting WK query for",
                    wcs_string, ":", exc)
                _appendFile(
                    logFilename,
                    "ERROR getUrlsLinked2Wikicats(): Error starting WK query for "
                    + wcs_string + ": " + repr(exc))
                requestWK = None

            requestObjects[wikicat].update(
                {"wk":
                 requestWK})  # store the request WK object for this wikicat
            requestDone = 1

        if requestDone == 1:
            time.sleep(3)  # delay to avoid server rejects for too many queries

    print("\n** ALL PENDING QUERIES LAUNCHED\n")

    # End of the first phase. All queries launched. Now, for every wikicat, we have:
    # requestObjects[wikicat] = {"dburls": URLs} or  {"db": requestDB}
    #                       and {"wkurls": URLS} or  {"wk": requestWK}

    # let's build an object {"db": urlsDB, "wk": urlsWK} for each wikicat (each field is a URL list)
    urlsObjects = {}

    # Second phase. Now, read the results received from all queries

    for wikicat in selectedWikicats:

        # first, study results for DB

        try:
            urlsDB = requestObjects[wikicat][
                "dburls"]  # try to recover local DB results
        except:
            requestDB = requestObjects[wikicat][
                "db"]  # no local DB results, get the request DB object for this wikicat

            if requestDB == None:  # error starting DB query, return []
                urlsDB = []
            else:
                try:
                    try:
                        print("Waiting DB query result for:", wikicat)
                        responseDB = requestDB.result(
                        )  # waiting for DB query completion
                    except:
                        raise Exception("timeout")

                    if responseDB.status_code != 200:  # check if DB query ended correctly
                        raise Exception("answer is not 200, is " +
                                        str(responseDB.status_code))

                    try:
                        responseDBJson = responseDB.json()
                    except:
                        raise Exception("error decoding JSON")

                    try:
                        bindingsDB = responseDBJson["results"]["bindings"]
                    except:
                        raise Exception("no [results][bindings] in the answer")

                    # remove bindings with no pt field (isPrimaryTopicOf), because they don't correspond to DBpedia entities ???
                    bindingsDBwithPT = list(filter(_hasFieldPT, bindingsDB))
                    urlsDB = list(
                        map(lambda x: x["pt"]["value"], bindingsDBwithPT)
                    )  # keep only the URL in x["pt"]["value"]

                    if len(urlsDB) > 0:
                        _saveFile(
                            _URLs_FOLDER + "/_Wikicat_" + wikicat +
                            "_DB_Urls.txt", '\n'.join(urlsDB)
                        )  # save all results from DB for this wikicat
                    else:
                        print(
                            "*** getUrlsLinked2Wikicats(): ", wikicat,
                            " provided 0 DB URLs from " +
                            str(len(bindingsDB)) + " results")
                        _appendFile(
                            logFilename, "getUrlsLinked2Wikicats(): " +
                            wikicat + " provided 0 DB URLs from " +
                            str(len(bindingsDB)) + " results")

                except Exception as exc:
                    print(
                        "*** ERROR getUrlsLinked2Wikicats(): Error querying DB for",
                        wikicat, ":", exc)
                    _appendFile(
                        logFilename,
                        "ERROR getUrlsLinked2Wikicats(): Error querying DB for "
                        + wikicat + ": " + repr(exc))
                    urlsDB = []

        # end for DB, we already have urlsDB

        # second, study results for WK

        wcs = _getWikicatComponents(wikicat)
        wcs_string = " ".join(wcs)

        try:
            urlsWK = requestObjects[wikicat][
                "wkurls"]  # try to recover local WK results
        except:
            requestWK = requestObjects[wikicat][
                "wk"]  # no local WK results, get the request WK object for this wikicat

            # WK results come without prefix "https://en.wikipedia.org/wiki/", this function adds it
            def addWKPrefix(x):
                return "https://en.wikipedia.org/wiki/" + x["title"][
                    "value"].replace(" ", "_")

            if requestWK == None:  # error starting WK query, return []
                urlsWK = []
            else:
                try:
                    try:
                        print("Waiting WK query result for:", wikicat)
                        responseWK = requestWK.result(
                        )  # waiting for WK query completion
                    except:
                        raise Exception("timeout")

                    if responseWK.status_code != 200:  # check if WK query ended correctly
                        raise Exception("answer is not 200, is " +
                                        str(responseWK.status_code))

                    try:
                        responseWKJson = responseWK.json()
                    except:
                        raise Exception("error decoding JSON")

                    try:
                        bindingsWK = responseWKJson["results"]["bindings"]
                    except:
                        raise Exception("no [results][bindings] in the answer")

                    urlsWK = list(
                        map(addWKPrefix, bindingsWK)
                    )  # add WK prefix to x["title"]["value"], changing space by '_'

                    if len(urlsWK) > 0:
                        _saveFile(
                            _URLs_FOLDER + "/_Wikicat_" + wikicat +
                            "_WK_Urls.txt", '\n'.join(urlsWK)
                        )  # save all results from WK for this wikicat
                    else:
                        print("*** getUrlsLinked2Wikicats(): ", wikicat,
                              " provided 0 WK URLs")
                        _appendFile(
                            logFilename, "getUrlsLinked2Wikicats(): " +
                            wikicat + " provided 0 WK URLs")

                except Exception as exc:
                    print(
                        "*** ERROR getUrlsLinked2Wikicats(): Error querying WK for",
                        wcs_string, ":", exc)
                    _appendFile(
                        logFilename,
                        "ERROR getUrlsLinked2Wikicats(): Error querying WK for "
                        + wcs_string + ": " + repr(exc))
                    urlsWK = []

        # end for WK, we already have urlsWK

        # store results for this wikicat
        urlsObjects[wikicat] = {"db": urlsDB, "wk": urlsWK}

    print("\n** RECEIVED ALL RESULTS FOR PENDING QUERIES\n")

    return urlsObjects  # return results to buildCorpus function
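A minimal sketch of the launch-all-then-collect pattern used above, based on the requests-futures package; the endpoints below are placeholders and the SPARQL queries are omitted.

from requests_futures.sessions import FuturesSession

session = FuturesSession()
endpoints = ["https://dbpedia.org/sparql", "https://query.wikidata.org/sparql"]

futures = [session.get(url) for url in endpoints]   # phase 1: launch every request without waiting
for future in futures:                              # phase 2: collect the responses
    response = future.result()                      # blocks until this particular request finishes
    print(response.status_code)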
Example #20
def buildCorpus2():

    logFilename = "corpus.log"
    logFile = _Open(logFilename, "w")
    logFile.write(str(datetime.now()) + "\n")
    logFile.close()

    originalText = request.values.get(
        "text")  # get parameter with original text
    lenOriginalText = len(originalText)

    selectedWikicats = json.loads(
        request.values.get("wikicats"))  # get parameter with selected wikicats
    print("Number of selected wikicats:", len(selectedWikicats))
    numUrlsDB = 0
    numUrlsWK = 0

    # store the selected wikicats in the file $CORPUS_FOLDER/length.selected.wk
    _saveFile(_CORPUS_FOLDER + "/" + str(lenOriginalText) + ".selected.wk",
              '\n'.join(selectedWikicats))

    # read the original text subjects from local store
    filename_sb = _CORPUS_FOLDER + "/" + str(
        lenOriginalText) + ".sb"  # filename for subjects (length.sb)
    try:
        with _Open(filename_sb) as fp:
            sbOriginalText = fp.read().splitlines()
    except:
        sbOriginalText = []  # no subjects for original text
        _appendFile(logFilename, "Subjects file not available: " + filename_sb)

    # Now, we have wikicats in 'selectedWikicats' and subjects in 'sbOriginalText'

    overwriteCorpus = json.loads(
        request.values.get("overwriteCorpus")
    )  # read the flag parameter overwriteCorpus from request

    if overwriteCorpus:  # if overwriteCorpus, remove current corpus  (URLs, scrapped pages and wikicats files)
        print("Deleting current URLs lists...")
        shutil.rmtree(_URLs_FOLDER)
        print("Deleting current scrapped texts...")
        shutil.rmtree(_SCRAPPED_TEXT_PAGES_FOLDER)

    # create the folder to store two files per wikicat, with the URLs linked to such wikicat coming from DB and WK
    # it must be done before calling the getUrlsLinked2Wikicats function, which stores files there when it fetches them

    if not os.path.exists(_URLs_FOLDER):
        os.makedirs(_URLs_FOLDER)

    if not os.path.exists(
            _SCRAPPED_TEXT_PAGES_FOLDER
    ):  # create the folder to store scrapped pages and wikicat files
        os.makedirs(_SCRAPPED_TEXT_PAGES_FOLDER)

    # now get the URLs associated to any of those wikicats (this function is below)
    # it reads from local files if exist, otherwise it connects to Internet to fetch them and store them locally

    urlsObjects = getUrlsLinked2Wikicats(selectedWikicats, logFilename)

    # a dictionary entry has been received for each wikicat:   urlsObjects[wikicat] = {"db": urlsDB, "wk": urlsWK}
    # urlsDB and urlsWK are lists of URLs

    result = {}  # object to store the results to be returned to the request
    fullList = []  # to aggregate the full list of URLs for all wikicats

    # process all results to return

    print("Number of URLs for every wikicat: ", end='')

    for wikicat in selectedWikicats:

        # first, the results from DB

        dbUrls = urlsObjects[wikicat]["db"]  # get the set of DB URLs
        numUrlsDB += len(dbUrls)

        fullList.extend(
            dbUrls)  # add the DB URLs of current wikicat to the whole list

        # now, the results from WK

        wkUrls = urlsObjects[wikicat]["wk"]
        numUrlsWK += len(wkUrls)

        fullList.extend(wkUrls)

        longs1 = "(DB=" + str(len(dbUrls)) + ", WK=" + str(len(wkUrls)) + ")"
        print(wikicat, longs1, end=', ')
        result[wikicat] = {
            "db": len(dbUrls),
            "wk": len(wkUrls)
        }  # add results for this wikicat

    listWithoutDuplicates = list(set(fullList))  # remove duplicated URLs
    lenOfListWithoutDuplicates = len(
        listWithoutDuplicates)  # length of full list to process
    print("\n\nSummary of URLs numbers: DB=", numUrlsDB, ", WK= ", numUrlsWK,
          ", total without duplicates=", lenOfListWithoutDuplicates)

    _appendFile(
        logFilename,
        "Number of discovered URLs: " + str(lenOfListWithoutDuplicates))

    # returns number of results, the result items are only the numbers of discovered URLs
    result["totalDB"] = numUrlsDB
    result["totalWK"] = numUrlsWK
    result["totalUrls"] = len(listWithoutDuplicates)
    # return jsonify(result);  # uncomment to return to the interface without processing files

    if aux.PSTOP == True:
        input("Type ENTER to continue...")

    ###  We've got the first set of relevant URLs, available in listWithoutDuplicates, and stored in the URLs folder
    ###  Let's start the analysis of their contents

    print("\n Downloading and cleaning candidate texts...")

    scrap = _scrapFunctions()  # Create a scrapFunctions object to clean pages
    unretrieved_pages_list = []  # a list for unsuccessful pages retrieval

    nowDownloaded = 0  # number of files downloaded from Internet in this iteration

    listEnoughContent = [
    ]  # list of pages with sufficient content to proceed  ( > _CORPUS_MIN_TXT_SIZE bytes, a constant from aux.py)
    listNotEnoughContent = [
    ]  # list of pages with insufficient content to proceed

    # download not locally stored pages, scrap them, and save them
    for idx, page in enumerate(listWithoutDuplicates, start=1):

        print("(", idx, "of", lenOfListWithoutDuplicates, ") -- ", page)

        # scrapped pages will be stored classified by domain, in specific folders
        # currently, only "en.wikipedia.org" domain is used

        pageWithoutHTTP = page[2 +
                               page.find("//"):]  # get the domain of this page
        domainFolder = pageWithoutHTTP[:pageWithoutHTTP.find("/")]

        if (not os.path.exists(_SCRAPPED_TEXT_PAGES_FOLDER + "/" +
                               domainFolder)
            ):  # create this domain folder if not exists
            os.makedirs(_SCRAPPED_TEXT_PAGES_FOLDER + "/" + domainFolder)

        # the pagename will be the name of the file, with the following change
        # dir1/dir2/page --> dir1..dir2..page.txt

        onlyPage = pageWithoutHTTP[1 + pageWithoutHTTP.find("/"):]
        onlyPageChanged = onlyPage.replace("/", "..")

        # Add file extension '.txt' to page name for saving it   !!!!!!!!!!
        # pageFinalName = page[1+page.rindex("/"):]
        fileNameCandidate = _SCRAPPED_TEXT_PAGES_FOLDER + "/" + domainFolder + "/" + onlyPageChanged + ".txt"

        if (os.path.exists(fileNameCandidate)):
            print("File already available in local DB:", fileNameCandidate)
            fsize = os.path.getsize(fileNameCandidate)
            if fsize < _CORPUS_MIN_TXT_SIZE:
                listNotEnoughContent.append(page)
            else:
                listEnoughContent.append(page)
        else:  # fetch file if not exists
            try:  # Retrieves the URL, and get the page title and the scraped page content
                pageName, pageContent = scrap.scrapPage(
                    page)  # pageName result is not used
                nowDownloaded += 1
                _saveFile(fileNameCandidate, pageContent)  # Save to text file
                print("File", str(nowDownloaded), "downloaded and saved it:",
                      fileNameCandidate)

                if (len(pageContent) < _CORPUS_MIN_TXT_SIZE):
                    listNotEnoughContent.append(page)
                else:
                    listEnoughContent.append(page)
            except Exception as exc:
                _appendFile(
                    logFilename,
                    "Page " + page + " could not be retrieved: " + repr(exc))
                unretrieved_pages_list.append(page)

    # Save the unretrieved_pages_list to a file
    print("")
    print(str(len(unretrieved_pages_list)) + " unretrieved pages")
    _saveFile(_UNRETRIEVED_PAGES_FILENAME, '\n'.join(unretrieved_pages_list))

    lenListEnoughContent = len(listEnoughContent)

    _appendFile(
        logFilename, "Number of available pages with enough content: " +
        str(lenListEnoughContent))

    print("ALL PAGES AVAILABLE AND CLEANED.")
    print("New pages downloaded in this iteration:", str(nowDownloaded))
    print("Number of pages with enough content:", str(lenListEnoughContent))
    print("Number of pages without enough content:",
          str(len(listNotEnoughContent)))

    if aux.PSTOP == True:
        input("Type ENTER to continue...")

    # all the pages not already available have been now fetched and cleaned

    # # Create a new csv file if it does not exist. WHAT DOES 'w+' MEAN? Temporarily disabled until it is clear what should be saved
    # with _Open(_SIMILARITIES_CSV_FILENAME, 'w+') as writeFile:
    # 	# Name columns
    # 	fieldnames = ['URL', 'Euclidean Distance', 'Spacy', 'Doc2Vec Euclidean Distance',
    # 	'Doc2Vec Cosine Similarity', 'Trained Doc2Vec Euclidean Distance', 'Trained Doc2Vec Cosine Similarity',
    # 	'Wikicats Jaccard Similarity']
    #
    # 	# Create csv headers
    # 	writer = csv.DictWriter(writeFile, fieldnames=fieldnames, delimiter=";")
    #
    # 	# Write the column headers
    # 	writer.writeheader()

    print("")
    print(
        "Identifying wikicats and subjects for candidate texts with DBpedia SpotLight..."
    )
    currentDownloaded = 0

    listWithWikicats = []  # list of pages with available wikicats
    listWithoutWikicats = []  # list of pages with no wikicats

    for idx, page in enumerate(listEnoughContent, start=1):
        print("\n(", idx, "of", lenListEnoughContent, ") -- ", page)

        # Build filenames for this page
        pageWithoutHTTP = page[2 + page.find("//"):]
        domainFolder = pageWithoutHTTP[:pageWithoutHTTP.find("/")]
        onlyPage = pageWithoutHTTP[1 + pageWithoutHTTP.find("/"):]
        onlyPageChanged = onlyPage.replace("/", "..")
        fileNameCandidateBase = _SCRAPPED_TEXT_PAGES_FOLDER + "/" + domainFolder + "/" + onlyPageChanged
        fileNameCandidate = fileNameCandidateBase + ".txt"
        fileNameCandidateWikicats = fileNameCandidateBase + ".wk"  # wikicats file for this page
        fileNameCandidateSubjects = fileNameCandidateBase + ".sb"  # subjects file for this page

        # if both files (wikicats and subjects) exist, use them from the local store
        if os.path.exists(fileNameCandidateWikicats) and os.path.exists(
                fileNameCandidateSubjects):
            print("Files WK and SB already available in local DB for",
                  fileNameCandidate)
            fwsize = os.path.getsize(fileNameCandidateWikicats)
            fssize = os.path.getsize(fileNameCandidateSubjects)
            # if one of these two files is empty (no wikicats or no subjects), this page will not be used
            if (fwsize == 0) or (fssize == 0):
                listWithoutWikicats.append(page)
            else:
                listWithWikicats.append(page)
        else:  # if either file does not exist, fetch the candidate text wikicats and subjects from the Internet
            try:  # open and read text of candidate file
                candidateTextFile = _Open(fileNameCandidate, "r")
                candidate_text = candidateTextFile.read()
                print("Reading candidate text file:", fileNameCandidate)
            except:  # file that inexplicably could not be read from local store, it will not be used
                _appendFile(
                    logFilename,
                    "ERROR buildCorpus2(): Unavailable candidate file, not in the store, but it should be: "
                    + fileNameCandidate)
                listWithoutWikicats.append(page)
                continue

            print("Computing wikicats and subjects for:", page)
            candidate_text_categories = _getCategoriesInText(
                candidate_text
            )  # function _getCategoriesInText from px_DB_Manager

            if ("error" in candidate_text_categories
                ):  # error while fetching info, the page will not be used
                _appendFile(
                    logFilename,
                    "ERROR buildCorpus2(): Problem in _getCategoriesInText(candidate_text): "
                    + candidate_text_categories["error"])
                listWithoutWikicats.append(page)
                continue

            print("Wikicats and subjects downloaded for", fileNameCandidate)
            candidate_text_wikicats = list(
                filter(_filterSimpleWikicats,
                       candidate_text_categories["wikicats"])
            )  # remove simple wikicats with function from aux.py
            candidate_text_subjects = list(
                filter(_filterSimpleSubjects,
                       candidate_text_categories["subjects"])
            )  # remove simple subjects with function from aux.py

            _saveFile(fileNameCandidateWikicats,
                      '\n'.join(candidate_text_wikicats)
                      )  # save file with original text wikicats, one per line
            _saveFile(fileNameCandidateSubjects,
                      '\n'.join(candidate_text_subjects)
                      )  # save file with original text subjects, one per line
            currentDownloaded += 1

            # if there are no wikicats or no subjects, the page will not be used
            if (len(candidate_text_wikicats)
                    == 0) or (len(candidate_text_subjects) == 0):
                listWithoutWikicats.append(page)
            else:
                listWithWikicats.append(page)

    lenListWithWikicats = len(listWithWikicats)

    _appendFile(
        logFilename, "Number of available pages with wikicats and subjects: " +
        str(lenListWithWikicats))

    print("")
    print("ALL WIKICATs AND SUBJECTs COMPUTED.")
    print("New items computed in this iteration:", str(currentDownloaded))
    print("Number of pages with wikicats:", str(len(listWithWikicats)))
    print("Number of pages without wikicats:", str(len(listWithoutWikicats)))

    if aux.PSTOP == True:
        input("Type ENTER to continue...")

    print("\n Computing similarities...")

    discarded_pages_list = []  # a list to save discarded pages' URLs
    similarity = _textSimilarityFunctions(
    )  # Create a textSimilarityFunctions object to measure text similarities

    # variables to store results
    sims_wk_sb = [
    ]  # list of triplets (filenameCandidate, similarityByWikicats, similarityBySubjects)
    distribution_wk = {
        "0": 0,
        "1": 0,
        "2": 0,
        "3": 0,
        "4": 0,
        "5": 0,
        "6": 0,
        "7": 0,
        "8": 0,
        "9": 0
    }
    distribution_sb = {
        "0": 0,
        "1": 0,
        "2": 0,
        "3": 0,
        "4": 0,
        "5": 0,
        "6": 0,
        "7": 0,
        "8": 0,
        "9": 0
    }

    # Measure text similarity, and discard pages (discarded_pages_list) without a minimum similarity
    for idx, page in enumerate(listWithWikicats, start=1):

        print("(", idx, "of", lenListWithWikicats, ") -- ", page)

        # Build filename for this page
        pageWithoutHTTP = page[2 + page.find("//"):]
        domainFolder = pageWithoutHTTP[:pageWithoutHTTP.find("/")]
        onlyPage = pageWithoutHTTP[1 + pageWithoutHTTP.find("/"):]
        onlyPageChanged = onlyPage.replace("/", "..")
        fileNameCandidateBase = _SCRAPPED_TEXT_PAGES_FOLDER + "/" + domainFolder + "/" + onlyPageChanged
        fileNameCandidate = fileNameCandidateBase + ".txt"
        fileNameCandidateWikicats = fileNameCandidateBase + ".wk"
        fileNameCandidateSubjects = fileNameCandidateBase + ".sb"

        # try:  # open and read local file if already exists
        # 	candidateTextFile = _Open(fileNameCandidate, "r")
        # 	pageContent = candidateTextFile.read()
        # 	print("Reading file:", fileNameCandidate)
        # except:  # file that could not be downloaded
        # 	print("ERROR buildCorpus2(): Unavailable file, not in the store, but it should be:", fileNameCandidate)
        # 	input("ENTER to continue...")
        # 	continue

        # Compare original text with the text of this candidate (in pageContent)
        # several criteria are now computed. THEIR RELEVANCE SHOULD BE STUDIED AS SOON AS POSSIBLE

        # Measure text similarity based on the Lee doc2vec model

        # doc2vec_cosineSimilarity, doc2vec_euclideanDistance = similarity.doc2VecTextSimilarity(originalText, pageContent, _LEE_D2V_MODEL)
        # print("Lee Doc2Vec CS = "+str(doc2vec_cosineSimilarity))
        # print("Lee Doc2Vec ED = "+str(doc2vec_euclideanDistance))
        #
        # # Measure text similarity based on the trained doc2vec model with our training corpus
        # doc2vec_trained_cosineSimilarity, doc2vec_trained_euclideanDistance = similarity.doc2VecTextSimilarity(originalText, pageContent, _OWN_D2V_MODEL)
        # print("Trained Doc2Vec CS = "+str(doc2vec_trained_cosineSimilarity))
        # print("Trained Doc2Vec ED = "+str(doc2vec_trained_euclideanDistance))
        #
        # # Measure the euclidean distance using SKLEARN
        # euclidean_distance = similarity.euclideanTextSimilarity(originalText, pageContent)
        # print("Euclidean distance = "+str(euclidean_distance))
        #
        # # Measure the spaCy distance
        # spacy_similarity = similarity.spacyTextSimilarity(originalText, pageContent)
        # print("Spacy similarity = "+str(spacy_similarity))

        # Measure wikicats similarity (requires complete matching)
        # wikicats_jaccard_similarity, subjects_jaccard_similarity = similarity.fullWikicatsAndSubjectsSimilarity(originalText, pageContent)
        # print("Wikicats full jaccard similarity = "+str(wikicats_jaccard_similarity))
        # print("Subjects full jaccard similarity = "+str(subjects_jaccard_similarity))

        # Measure wikicats similarity (requires shared matching)
        shared_wikicats_jaccard_similarity = similarity.sharedWikicatsSimilarity(
            selectedWikicats, fileNameCandidateWikicats, logFilename)
        print("Wikicats shared jaccard similarity = " +
              str(shared_wikicats_jaccard_similarity))

        shared_subjects_jaccard_similarity = similarity.sharedSubjectsSimilarity(
            sbOriginalText, fileNameCandidateSubjects, logFilename)
        print("Subjects shared jaccard similarity = " +
              str(shared_subjects_jaccard_similarity))

        sims_wk_sb.append(
            (fileNameCandidate, shared_wikicats_jaccard_similarity,
             shared_subjects_jaccard_similarity))

        # to compute distributions
        if shared_wikicats_jaccard_similarity == -1:
            _appendFile(
                logFilename, "ERROR computing sharedWikicatsJaccard: " +
                fileNameCandidateWikicats)
        else:
            if shared_wikicats_jaccard_similarity < 0.1:
                distribution_wk["0"] = distribution_wk["0"] + 1
            elif shared_wikicats_jaccard_similarity < 0.2:
                distribution_wk["1"] = distribution_wk["1"] + 1
            elif shared_wikicats_jaccard_similarity < 0.3:
                distribution_wk["2"] = distribution_wk["2"] + 1
            elif shared_wikicats_jaccard_similarity < 0.4:
                distribution_wk["3"] = distribution_wk["3"] + 1
            elif shared_wikicats_jaccard_similarity < 0.5:
                distribution_wk["4"] = distribution_wk["4"] + 1
            elif shared_wikicats_jaccard_similarity < 0.6:
                distribution_wk["5"] = distribution_wk["5"] + 1
            elif shared_wikicats_jaccard_similarity < 0.7:
                distribution_wk["6"] = distribution_wk["6"] + 1
            elif shared_wikicats_jaccard_similarity < 0.8:
                distribution_wk["7"] = distribution_wk["7"] + 1
            elif shared_wikicats_jaccard_similarity < 0.9:
                distribution_wk["8"] = distribution_wk["8"] + 1
            else:
                distribution_wk["9"] = distribution_wk["9"] + 1

        if shared_subjects_jaccard_similarity == -1:
            _appendFile(
                logFilename, "ERROR computing sharedSubjectsJaccard: " +
                fileNameCandidateSubjects)
        else:
            if shared_subjects_jaccard_similarity < 0.1:
                distribution_sb["0"] = distribution_sb["0"] + 1
            elif shared_subjects_jaccard_similarity < 0.2:
                distribution_sb["1"] = distribution_sb["1"] + 1
            elif shared_subjects_jaccard_similarity < 0.3:
                distribution_sb["2"] = distribution_sb["2"] + 1
            elif shared_subjects_jaccard_similarity < 0.4:
                distribution_sb["3"] = distribution_sb["3"] + 1
            elif shared_subjects_jaccard_similarity < 0.5:
                distribution_sb["4"] = distribution_sb["4"] + 1
            elif shared_subjects_jaccard_similarity < 0.6:
                distribution_sb["5"] = distribution_sb["5"] + 1
            elif shared_subjects_jaccard_similarity < 0.7:
                distribution_sb["6"] = distribution_sb["6"] + 1
            elif shared_subjects_jaccard_similarity < 0.8:
                distribution_sb["7"] = distribution_sb["7"] + 1
            elif shared_subjects_jaccard_similarity < 0.9:
                distribution_sb["8"] = distribution_sb["8"] + 1
            else:
                distribution_sb["9"] = distribution_sb["9"] + 1

        # # Save similarity to a CSV file
        # with _Open(_SIMILARITIES_CSV_FILENAME, 'a') as writeFile:
        # 	writer = csv.writer(writeFile, delimiter=';')
        # 	writer.writerow([page, euclidean_distance, spacy_similarity, doc2vec_euclideanDistance,
        # 	doc2vec_cosineSimilarity, doc2vec_trained_euclideanDistance, doc2vec_trained_cosineSimilarity, shared_wikicats_jaccard_similarity])

        # Minimum similarity for a page to be accepted.
        # WE MUST DECIDE THE MOST RELEVANT CRITERION TO DECIDE ON IT
        # currently, we use shared_wikicats_jaccard_similarity

    min_similarity = 0.3  # review this threshold

    both_above_min = list(
        filter(
            lambda triple: (
                (triple[1] > min_similarity) and (triple[2] > min_similarity)),
            sims_wk_sb))

    _appendFile(
        logFilename, "Number of pages with both similarities above " +
        str(min_similarity) + " = " + str(len(both_above_min)))
    print("Number of pages with both similarities above", min_similarity, "=",
          len(both_above_min))

    sims_wk_sb_str = list(
        map(
            lambda triple:
            (triple[0] + " " + str(triple[1]) + " " + str(triple[2])),
            sims_wk_sb))
    _saveFile(_CORPUS_FOLDER + "/" + str(lenOriginalText) + ".sims",
              '\n'.join(sims_wk_sb_str))

    result["distribution_wk"] = distribution_wk
    result["distribution_sb"] = distribution_sb

    # Save the discarded_pages_list to a file
    _saveFile(_DISCARDED_PAGES_FILENAME, '\n'.join(discarded_pages_list))
    # print(str(len(discarded_pages_list)) + " discarded pages")

    # print distributions: per decile, the count, its percentage, and the cumulative percentage
    for label, distribution in (("TOTAL WIKICATS", distribution_wk),
                                ("TOTAL SUBJECTS", distribution_sb)):
        print(label, "= ", lenListWithWikicats)
        cumulative = 0
        for k in range(10):
            t = distribution[str(k)]
            p = 100 * t / lenListWithWikicats
            cumulative += t
            pa = 100 * cumulative / lenListWithWikicats
            print("%d: %6d - %8.2f - %8.2f" % (k, t, p, pa))

    return jsonify(result)
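The repeated similarity bucketing in buildCorpus2() could be condensed into a small helper; the sketch below (not part of the project) is equivalent to the if/elif chains above.

def decile_bucket(similarity):
    # returns "0" for [0, 0.1), "1" for [0.1, 0.2), ..., "9" for similarities of 0.9 and above
    for i in range(9):
        if similarity < (i + 1) / 10:
            return str(i)
    return "9"

# usage: distribution_wk[decile_bucket(shared_wikicats_jaccard_similarity)] += 1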
Example #21
File: px_aux.py  Project: agilll/Plethora
def appendFile(f, line):
    d = str(datetime.now())
    fd = _Open(f, "a")
    fd.write(d + ": " + line + "\n")
    fd.close()
Example #22
def appendFile(f, line):
	fd = _Open(f, "a")
	fd.write(line+"\n")
	fd.close()
Example #23
# this program has been launched in the Plethora/buildCorpus folder
# this allows px_DB_Manager and px_aux to be found in the Plethora folder
# those modules are not needed here, but by the routesCorpus and routesCorpus2 modules loaded next
sys.path.append('../')

# functions to be executed when Flask requests are received
from routesCorpus import doPh1getWikicatsFromText as _doPh1getWikicatsFromText, doPh2getUrlsCandidateFiles as _doPh2getUrlsCandidateFiles
from routesCorpus import getWikicatUrls as _getWikicatUrls
from routesCorpus import doPh3downloadCandidateTexts as _doPh3downloadCandidateTexts, doPh4identifyWikicats as _doPh4identifyWikicats
from routesCorpus import doPh5computeSimilarities as _doPh5computeSimilarities, doPh6trainD2V as _doPh6trainD2V, doPh7reviewCorpus as _doPh7reviewCorpus
from aux_build import INITIAL_TEXT as _INITIAL_TEXT
import aux_build
import px_aux

# load the initial text shown at the beginning of the interface
initialTextFile = _Open(_INITIAL_TEXT, "r")
initialText = initialTextFile.read()

FLAB = False  # to control if buttons must show additional label details (change to True if argument -l)

# the following is only executed if this is the main program, that is, if we launch the corpus tool directly from the 'buildCorpus' folder
# not executed if we launch the corpus tool from the main tool, as the 'app' object is already available from the main tool
if __name__ == '__main__':
    import os

    # Flask is a module to launch a web server. It allows mapping a function to each request route
    from flask import Flask, render_template, request, flash, json, jsonify, redirect, url_for, send_from_directory

    # templates dir is shared with the main tool because it is possible for this tool to be called from the main one
    template_dir = os.path.abspath('../templates')
    # Create the Flask app to manage the HTTP request