Пример #1
0
    def __init__(self, wantedFeatures, pathLimit=8):
        """Create the query helper, the feature extractors and the dispatch table.

        Args:
            wantedFeatures: Stored as-is on ``self.wantedFeatures``. Presumably
                the keys of ``feature_function_dict`` that should be
                evaluated -- TODO confirm against the callers.
            pathLimit: Stored as-is on ``self.pathLimit``; defaults to 8.
        """
        # NOTE(review): the bolt URI and the credentials are hard-coded here.
        self._qhelper = runQuery.QueryHelper(
            GraphDatabase.driver("bolt://127.0.0.1:10001",
                                 auth=basic_auth("neo4j", "12345")))

        # Both extractors share the query helper created above.
        self.relation = relationship.RelationshipGetter(self._qhelper)
        self.word2vec = word2vec.word2vec(self._qhelper)

        self.pathLimit = pathLimit
        self.wantedFeatures = wantedFeatures

        # Maps a feature name to a callable taking
        # (featureDict, articleA, articleB). The lambdas resolve their target
        # on ``self`` only when they are called. Their first parameter is
        # named ``featureDict`` so that it does not shadow the builtin ``dict``.
        self.feature_function_dict = {
            "pathWeight":
            lambda featureDict, articleA, articleB: self._shortestPath(
                featureDict, articleA, articleB),
            "keywords":
            lambda featureDict, articleA, articleB: self._getKeywords(
                featureDict, articleA, articleB),
            "categories":
            lambda featureDict, articleA, articleB: self._getCategories(
                featureDict, articleA, articleB),
            # TODO: both word2vec features call extractWord2vec, so the
            # similarity is calculated twice.
            "word2vecSimilarity":
            lambda featureDict, articleA, articleB:
            self.word2vec.extractWord2vec(featureDict, articleA, articleB),
            "word2vecBuckets":
            lambda featureDict, articleA, articleB:
            self.word2vec.extractWord2vec(featureDict, articleA, articleB),

            # relation features:
            "predecessorJaccard":
            lambda featureDict, articleA, articleB:
            self.relation.getPredecessorJaccard(
                featureDict, articleA, articleB),
            "successorJaccard":
            lambda featureDict, articleA, articleB:
            self.relation.getSuccessorJaccard(
                featureDict, articleA, articleB),
            "predecessorCount":
            lambda featureDict, articleA, articleB:
            self.relation.getPredecessorCount(
                featureDict, articleA, articleB),
            "successorCount":
            lambda featureDict, articleA, articleB:
            self.relation.getSuccessorCount(
                featureDict, articleA, articleB),
            "bestPredecessor":
            lambda featureDict, articleA, articleB:
            self.relation.getBestPredecessor(
                featureDict, articleA, articleB),
            "bestSuccessor":
            lambda featureDict, articleA, articleB:
            self.relation.getBestSuccessor(
                featureDict, articleA, articleB),
        }

        # Start out with empty "previous article" records for both ends.
        self._prevFrom = {"name": "", "outgoing": [], "incoming": []}
        self._prevTo = {"name": "", "outgoing": [], "incoming": []}
Пример #2
0
def getTitleMatches(nGrams):
    """Look up each n-gram as a page title and collect the stored titles.

    For every entry of ``nGrams`` a query is run that matches ``Page`` nodes
    whose ``title`` or ``lower_cased_title`` equals the n-gram. When the first
    element of the query result is not ``None``, its ``a.title`` value is
    appended to the returned list; otherwise the n-gram is skipped.

    Args:
        nGrams: Iterable of strings to look up.

    Returns:
        List of ``a.title`` values, in the order of the matching n-grams.
    """
    # NOTE(review): a new driver/helper is created on every call.
    helper = runQuery.QueryHelper(
        GraphDatabase.driver("bolt://localhost:10001",
                             encrypted=False,
                             auth=basic_auth("neo4j", "12345")))

    # The Cypher text is identical for every n-gram; only the parameter varies.
    titleQuery = "MATCH (a:Page) where (a.title = {title} or a.lower_cased_title = {title}) return a.title"

    matches = []
    for candidate in nGrams:
        rows = helper.runQuery(titleQuery, {"title": candidate})

        # No page found for this n-gram.
        if rows[0] is None:
            continue

        matches.append(rows[0]["a.title"])

    return matches
Пример #3
0
    def getPairsFromArticle(self, title):
        mapping = {"title": title}
        query = "MATCH (a:Page) WHERE a.title = {title} Return a.text"

        # load text from old db that has text
        qh = runQuery.QueryHelper(
            GraphDatabase.driver("bolt://localhost:10004",
                                 encrypted=False,
                                 auth=basic_auth("neo4j", "12345")))
        queryResult = qh.runQuery(query, mapping)
        print("Got text")

        #Result is a list. result[0] is a record. result[0][0] is the actual article text. Don't ask.
        # if no text was found, bail out
        print("Getting from " + title)
        if queryResult[0] is None:
            return (title, [], [])
        nGrams = getNgrams.getNgrams(queryResult[0][0], 5)
        print("Got %d grams " % len(nGrams))
        result_neg = []
        result_pos = []

        for gram in nGrams:
            if gram != title:
                mapping = {"fromTitle": title, "gram": gram}
                query = '''match (a:FeaturedPage {title:{fromTitle}})
with a as x
match (b:Page {lower_cased_title:{gram}})
with b as y, x as a
optional match (a)-[r:TRAINING_DATA|TEST_DATA|LINKS_TO]->(y) return a.title, y.title as target, count(r) > 0 as hasLink'''
                res = self._qh.runQuery(query, mapping)
                # the result is None, if gram is not an article that is featured/good
                if res[0] is not None:
                    res_hasLink = res[0]["hasLink"]
                    if not res_hasLink:
                        result_neg.append(res[0]["target"])
                        if len(result_neg) >= 65:
                            break

        return (title, result_pos, result_neg)
Пример #4
0
    def getPairsFromArticleThatIsFeaturedOrGood(self, title, allNodes):
        mapping = {"title": title}
        query = "MATCH (a:Page) WHERE a.title = {title} Return a.text"

        # load text from old db that has text
        qh = runQuery.QueryHelper(
            GraphDatabase.driver("bolt://localhost:10004",
                                 encrypted=False,
                                 auth=basic_auth("neo4j", "12345")))
        queryResult = qh.runQuery(query, mapping)

        #Result is a list. result[0] is a record. result[0][0] is the actual article text. Don't ask.
        # if no text was found, bail out
        if queryResult[0] is None:
            return (title, [], [])
        nGrams = getNgrams.getNgrams(queryResult[0][0], 5)
        nGrams = list(filter(lambda x: x in allNodes, nGrams))
        result_neg = []
        result_pos = []

        for gram in nGrams:
            if gram != title:
                mapping = {"fromTitle": title, "gram": gram}

                # DOES NOT WORK WITH POSITIVES!!!

                query = '''match (a:FeaturedPage {title:{fromTitle}})
with a as x
match (b:Page {lower_cased_title:{gram}}) where ((b:FeaturedPage) or (b:GoodPage))
with b as y, x as a
optional match (a)-[r:TRAINING_DATA|TEST_DATA|LINKS_TO]->(y) return a.title, y.title, count(r) > 0 as hasLink'''
                res = self._qh.runQuery(query, mapping)
                # the result is None, if gram is not an article that is featured/good
                if res[0] is not None:
                    res_hasLink = res[0]["hasLink"]
                    if not res_hasLink:
                        result_neg.append(gram)

        return (title, result_pos, result_neg)
Пример #5
0
import getNgramPairs
import runQuery
from neo4j.v1 import GraphDatabase, basic_auth

def getFeaturedOrGood(qh):
    """Return the titles of all FeaturedPage nodes followed by all GoodPage nodes.

    Args:
        qh: Object with a ``runQuery(query, params)`` method whose result can
            be iterated and yields items indexable by ``'n.title'``.

    Returns:
        List of the ``n.title`` values in the order the query returned them.
    """
    rows = qh.runQuery("MATCH (n:FeaturedPage) RETURN n.title UNION ALL MATCH (n:GoodPage) RETURN n.title", {})
    # NOTE(review): the result is not closed explicitly -- confirm whether
    # runQuery results need cleanup.
    return [row['n.title'] for row in rows]

# Shared query helper for the database at localhost:10001.
qh = runQuery.QueryHelper(
    GraphDatabase.driver(
        "bolt://localhost:10001",
        encrypted=False,
        auth=basic_auth("neo4j", "12345"),
    )
)
finder = getNgramPairs.pairFinder(qh)

# Load the titles of all featured articles up front.
all_featured_res = qh.runQuery("match (a:FeaturedPage) return a.title as title", {})
all_featured = [row["title"] for row in all_featured_res]

all_featured_len = len(all_featured)
print("Got all featured articles " + str(all_featured_len))

#featuredOrGoodList = set(getFeaturedOrGood(qh))

#print("Got all good and featured articles " + str(len(featuredOrGoodList)))

#with open("positives", "w", encoding="utf-8") as positives:
with open("featured->all_only_negatives.csv", "w", encoding="utf-8") as negatives:
    for i, title in enumerate(all_featured):
        print("Progress: {}/{}".format(i+1,all_featured_len))