def __init__(self, wantedFeatures, pathLimit=8):
    """Set up the query helper, the sub-extractors and the feature dispatch table.

    A ``runQuery.QueryHelper`` is created on top of a bolt driver for
    ``bolt://127.0.0.1:10001`` and shared with the relationship getter and
    the word2vec extractor.

    Args:
        wantedFeatures: stored unchanged as ``self.wantedFeatures``.
            Presumably the names of the features to compute, i.e. keys of
            ``self.feature_function_dict`` -- TODO confirm with the callers.
        pathLimit: stored unchanged as ``self.pathLimit`` (default 8).
            Presumably a bound used by the shortest-path feature -- TODO
            confirm.
    """
    self._qhelper = runQuery.QueryHelper(
        GraphDatabase.driver(
            "bolt://127.0.0.1:10001", auth=basic_auth("neo4j", "12345")))
    self.relation = relationship.RelationshipGetter(self._qhelper)
    self.word2vec = word2vec.word2vec(self._qhelper)
    self.pathLimit = pathLimit
    self.wantedFeatures = wantedFeatures

    # Feature name -> callable(dct, articleA, articleB).
    # The lambdas are kept (instead of storing the bound methods directly) so
    # that the target attribute is only looked up when a feature is actually
    # requested.  Their first parameter is named ``dct`` so that the builtin
    # ``dict`` is not shadowed.
    self.feature_function_dict = {
        "pathWeight":
            lambda dct, articleA, articleB: self._shortestPath(
                dct, articleA, articleB),
        "keywords":
            lambda dct, articleA, articleB: self._getKeywords(
                dct, articleA, articleB),
        "categories":
            lambda dct, articleA, articleB: self._getCategories(
                dct, articleA, articleB),
        # TODO: both word2vec features dispatch to the same extractWord2vec
        # call, so requesting both computes the word2vec similarity twice.
        "word2vecSimilarity":
            lambda dct, articleA, articleB: self.word2vec.extractWord2vec(
                dct, articleA, articleB),
        "word2vecBuckets":
            lambda dct, articleA, articleB: self.word2vec.extractWord2vec(
                dct, articleA, articleB),
        # relation features:
        "predecessorJaccard":
            lambda dct, articleA, articleB: self.relation.getPredecessorJaccard(
                dct, articleA, articleB),
        "successorJaccard":
            lambda dct, articleA, articleB: self.relation.getSuccessorJaccard(
                dct, articleA, articleB),
        "predecessorCount":
            lambda dct, articleA, articleB: self.relation.getPredecessorCount(
                dct, articleA, articleB),
        "successorCount":
            lambda dct, articleA, articleB: self.relation.getSuccessorCount(
                dct, articleA, articleB),
        "bestPredecessor":
            lambda dct, articleA, articleB: self.relation.getBestPredecessor(
                dct, articleA, articleB),
        "bestSuccessor":
            lambda dct, articleA, articleB: self.relation.getBestSuccessor(
                dct, articleA, articleB),
    }

    # Start out with empty "previous article" records.
    # NOTE(review): presumably a cache of the last processed source/target
    # article and its links -- confirm against the methods that read them.
    self._prevFrom = {"name": "", "outgoing": [], "incoming": []}
    self._prevTo = {"name": "", "outgoing": [], "incoming": []}
def getTitleMatches(nGrams, qh=None):
    """Return the page titles that match the given n-grams.

    Every n-gram is looked up with one query that compares it against both
    ``a.title`` and ``a.lower_cased_title`` of ``Page`` nodes.  Only the first
    record of each query result is used, so an n-gram contributes at most one
    title; n-grams whose lookup yields nothing are skipped.

    Args:
        nGrams: iterable of candidate title strings.
        qh: optional object providing ``runQuery(query, mapping)``.  When
            omitted, a driver for ``bolt://localhost:10001`` is opened for
            the duration of this call and closed again before returning.

    Returns:
        A list with the ``a.title`` value of every matched n-gram, in the
        order of ``nGrams``.
    """
    query = "MATCH (a:Page) where (a.title = {title} or a.lower_cased_title = {title}) return a.title"
    ownDriver = None
    result = []
    try:
        if qh is None:
            # No helper supplied: open a private connection that we are
            # responsible for closing again.
            ownDriver = GraphDatabase.driver(
                "bolt://localhost:10001", encrypted=False,
                auth=basic_auth("neo4j", "12345"))
            qh = runQuery.QueryHelper(ownDriver)
        for gram in nGrams:
            queryRes = qh.runQuery(query, {"title": gram})
            # An empty result or a ``None`` first entry both mean "no match".
            if queryRes and queryRes[0] is not None:
                result.append(queryRes[0]["a.title"])
    finally:
        if ownDriver is not None:
            ownDriver.close()
    return result
def getPairsFromArticle(self, title, maxNegatives=65):
    """Collect n-grams of an article's text that name pages it does not link to.

    The article text is read from the database on ``bolt://localhost:10004``
    (the one that still stores ``a.text``); the link check for every n-gram
    runs through ``self._qh``.

    Args:
        title: title of the source article.
        maxNegatives: stop as soon as this many negatives were collected
            (default 65, the previously hard-coded limit).  ``None`` disables
            the limit.

    Returns:
        A tuple ``(title, positives, negatives)``.  ``positives`` is always
        an empty list: this method only collects negatives.  ``negatives``
        holds the ``target`` titles for which the link query reported
        ``hasLink`` as false.  If no text is found for ``title`` both lists
        are empty.
    """
    textQuery = "MATCH (a:Page) WHERE a.title = {title} Return a.text"
    # load text from old db that has text; the driver is only needed for this
    # single lookup, so close it again instead of leaking one per call
    textDriver = GraphDatabase.driver(
        "bolt://localhost:10004", encrypted=False,
        auth=basic_auth("neo4j", "12345"))
    try:
        queryResult = runQuery.QueryHelper(textDriver).runQuery(
            textQuery, {"title": title})
        print("Got text")
        print("Getting from " + title)
        # if no text was found, bail out
        if not queryResult or queryResult[0] is None:
            return (title, [], [])
        # queryResult is a list, queryResult[0] a record and
        # queryResult[0][0] the actual article text
        text = queryResult[0][0]
    finally:
        textDriver.close()

    nGrams = getNgrams.getNgrams(text, 5)
    print("Got %d grams " % len(nGrams))

    linkQuery = '''match (a:FeaturedPage {title:{fromTitle}}) with a as x match (b:Page {lower_cased_title:{gram}}) with b as y, x as a optional match (a)-[r:TRAINING_DATA|TEST_DATA|LINKS_TO]->(y) return a.title, y.title as target, count(r) > 0 as hasLink'''
    result_pos = []  # intentionally never filled, see docstring
    result_neg = []
    for gram in nGrams:
        if gram == title:
            continue
        res = self._qh.runQuery(linkQuery, {"fromTitle": title, "gram": gram})
        # NOTE(review): an empty/None result presumably means that the query
        # produced no row for this gram (no matching source or target page).
        if not res or res[0] is None:
            continue
        if not res[0]["hasLink"]:
            result_neg.append(res[0]["target"])
            if maxNegatives is not None and len(result_neg) >= maxNegatives:
                break
    return (title, result_pos, result_neg)
def getPairsFromArticleThatIsFeaturedOrGood(self, title, allNodes):
    """Collect n-grams of an article that name featured/good pages it does not link to.

    The article text is read from the database on ``bolt://localhost:10004``
    (the one that still stores ``a.text``); the link check for every n-gram
    runs through ``self._qh``.

    Args:
        title: title of the source article.
        allNodes: container supporting ``in``; only n-grams contained in it
            are checked.  Presumably the set of featured/good page titles --
            TODO confirm with the caller.

    Returns:
        A tuple ``(title, positives, negatives)``.  ``positives`` is always
        an empty list (this method DOES NOT WORK WITH POSITIVES).
        ``negatives`` holds the n-grams for which the link query reported
        ``hasLink`` as false.  If no text is found for ``title`` both lists
        are empty.
    """
    textQuery = "MATCH (a:Page) WHERE a.title = {title} Return a.text"
    # load text from old db that has text; the driver is only needed for this
    # single lookup, so close it again instead of leaking one per call
    textDriver = GraphDatabase.driver(
        "bolt://localhost:10004", encrypted=False,
        auth=basic_auth("neo4j", "12345"))
    try:
        queryResult = runQuery.QueryHelper(textDriver).runQuery(
            textQuery, {"title": title})
        # if no text was found, bail out
        if not queryResult or queryResult[0] is None:
            return (title, [], [])
        # queryResult is a list, queryResult[0] a record and
        # queryResult[0][0] the actual article text
        text = queryResult[0][0]
    finally:
        textDriver.close()

    nGrams = [gram for gram in getNgrams.getNgrams(text, 5) if gram in allNodes]

    # DOES NOT WORK WITH POSITIVES!!!
    linkQuery = '''match (a:FeaturedPage {title:{fromTitle}}) with a as x match (b:Page {lower_cased_title:{gram}}) where ((b:FeaturedPage) or (b:GoodPage)) with b as y, x as a optional match (a)-[r:TRAINING_DATA|TEST_DATA|LINKS_TO]->(y) return a.title, y.title, count(r) > 0 as hasLink'''
    result_pos = []  # intentionally never filled, see docstring
    result_neg = []
    for gram in nGrams:
        if gram == title:
            continue
        res = self._qh.runQuery(linkQuery, {"fromTitle": title, "gram": gram})
        # the result is None, if gram is not an article that is featured/good
        if not res or res[0] is None:
            continue
        if not res[0]["hasLink"]:
            result_neg.append(gram)
    return (title, result_pos, result_neg)
import getNgramPairs
import runQuery
from neo4j.v1 import GraphDatabase, basic_auth


def getFeaturedOrGood(qh):
    """Return the titles of all FeaturedPage nodes followed by all GoodPage nodes.

    ``qh`` must provide ``runQuery(query, mapping)``; the titles are read from
    the ``n.title`` field of every returned record.
    """
    records = qh.runQuery(
        "MATCH (n:FeaturedPage) RETURN n.title UNION ALL MATCH (n:GoodPage) RETURN n.title",
        {})
    titles = [entry['n.title'] for entry in records]
    #res.close()
    return titles


# Script part: one shared helper for the database on port 10001.
qh = runQuery.QueryHelper(
    GraphDatabase.driver(
        "bolt://localhost:10001",
        encrypted=False,
        auth=basic_auth("neo4j", "12345"),
    )
)
finder = getNgramPairs.pairFinder(qh)

# Titles of all featured articles; these are the source articles iterated below.
all_featured_res = qh.runQuery("match (a:FeaturedPage) return a.title as title", {})
all_featured = [record["title"] for record in all_featured_res]
all_featured_len = len(all_featured)
print("Got all featured articles " + str(all_featured_len))

# Disabled: restricting targets to featured/good pages and writing positives.
#featuredOrGoodList = set(getFeaturedOrGood(qh))
#print("Got all good and featured articles " + str(len(featuredOrGoodList)))
#with open("positives", "w", encoding="utf-8") as positives:

# NOTE(review): the loop body visible here only reports progress; nothing is
# written to the negatives file in this part of the script.
with open("featured->all_only_negatives.csv", "w", encoding="utf-8") as negatives:
    for i, title in enumerate(all_featured):
        print("Progress: {}/{}".format(i+1,all_featured_len))