Example 1
def recoverSubtopic():

    subtopic_id = int(sys.argv[1])
    atn_db  = DBHandler('./database/Memex.db')
    atn_db.cur.execute('UPDATE subtopic SET state=0 WHERE subtopic_id=?', [subtopic_id])
    atn_db.cur.execute(
        '''
        UPDATE filter_list SET state=1 
        WHERE topic_id = (SELECT topic_id FROM subtopic WHERE subtopic_id=?)
        AND docno IN (
        SELECT DISTINCT passage.docno FROM passage
        WHERE passage.subtopic_id=?
        AND passage.state=0) AND state!=1
        ''',[subtopic_id, subtopic_id])
    atn_db.cur.execute(
        '''
        INSERT INTO filter_list (topic_id, docno, state)
        SELECT DISTINCT subtopic.topic_id, passage.docno, 1 FROM subtopic, passage
        WHERE subtopic.subtopic_id = passage.subtopic_id
        AND subtopic.subtopic_id=?
        AND passage.state = 0
        AND passage.docno NOT in (SELECT docno FROM filter_list WHERE topic_id = subtopic.topic_id); 
        ''', [subtopic_id])
    atn_db.commit()
    atn_db.close()
Example 2
def userAuthentication(username, password):
    user_db = DBHandler(db_path.user)
    result = None
    user_db.cur.execute(
        'SELECT userid, username, usercookie FROM user WHERE username = ? AND password = ?',[username, password])
    result = user_db.cur.fetchone()
    user_db.close()
    return result
Example 3
def userAuthentication(username, password):
    user_db = DBHandler(db_path.user)
    result = None
    user_db.cur.execute(
        'SELECT userid, username, usercookie FROM user WHERE username = ? AND password = ?',
        [username, password])
    result = user_db.cur.fetchone()
    user_db.close()
    return result
Example 4
def cookieAuthentication(env):
    user_db = DBHandler(db_path.user)
    result = None
    if 'HTTP_COOKIE' in env:
        for pair in env['HTTP_COOKIE'].split(';'):
            cookie = pair.strip()
            if cookie.startswith('usercookie'):
                key, value = cookie.split('=')
                user_db.cur.execute('SELECT userid, username, usercookie FROM user WHERE usercookie = ?',[value,])
                result = user_db.cur.fetchone()
                break
    user_db.close()
    return result
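None of the snippets in this listing include the `DBHandler` class itself. Judging only from how it is used in the sqlite-style examples here (a constructor that takes a database path, a `.cur` cursor, `commit()`, `close()`, and an `insert(table, values)` helper whose callers read `cur.lastrowid`), a minimal sketch could look like the following. The class body is an assumption inferred from these call sites, not the project's actual implementation; the MongoDB/Elasticsearch variant in Example 20 clearly has a different constructor.

import sqlite3

class DBHandler(object):
    """Minimal sqlite3-backed sketch of the wrapper assumed by these examples."""

    def __init__(self, db_path):
        self.conn = sqlite3.connect(db_path)
        self.cur = self.conn.cursor()

    def insert(self, table, values):
        # Insert a full row; callers read self.cur.lastrowid right afterwards.
        placeholders = ','.join('?' * len(values))
        self.cur.execute('INSERT INTO %s VALUES(%s)' % (table, placeholders), values)

    def commit(self):
        self.conn.commit()

    def close(self):
        self.conn.close()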
Example 5
def get_paper(topic):
    """
    获取 paper more信息
    :param topic:
    :return:
    """
    return DBHandler.get_paper_data(SQL.MORE_PAPER.format(topic=topic))
Example 6
def get_bilingual(topic):
    """
    获取 双语 more信息
    :param topic: 关键词
    :return: 返回双语列表
    """
    return DBHandler.get_bi_data(SQL.BI_TOPIC.format(topic=topic))
Example 7
def get_lower_concept(topic):
    """
    获取下位概念
    :param topic: 关键词
    :return: 返回下位概念 OntologyRelation
    """
    return DBHandler.get_relation(SQL.RELATION.format(topic=topic), '2')
Example 8
def get_topic(topic):
    """
    由 topic 获取论文信息
    :param topic: 论文关键词
    :return: 有数据就返回 paper 信息
    """
    return DBHandler.get_paper_data(SQL.PAPER.format(topic=topic))
Example 9
def get_researcher(name):
    """
    由 科研人员 获取论文信息
    :param name: 科研人员姓名
    :return: 有数据就返回 paper 信息
    """
    return DBHandler.get_paper_data(SQL.RESEARCHER.format(name=name))
Example 10
def getDocList1():
    topic_id, subtopic_id = int(sys.argv[1]), int(sys.argv[2])
    atn_db = DBHandler('../../../database/test.db')

    atn_db.cur.execute(
        'SELECT userid, domain_id, topic_name FROM topic WHERE topic_id=?',
        [topic_id])
    userid, domain_id, topic_name = atn_db.cur.fetchone()

    atn_db.cur.execute('SELECT username FROM user WHERE userid=?', [userid])
    username, = atn_db.cur.fetchone()

    atn_db.cur.execute(
        'SELECT subtopic_name FROM subtopic WHERE subtopic_id=?',
        [subtopic_id])
    subtopic_name, = atn_db.cur.fetchone()

    corpus = ['EBOLA', 'POLAR', 'WEAPON'][domain_id - 1]
    r = requests.get(
        nistURL +
        "CMD=UID=%d TID=%d STID=%d.%d CO=%s CMD=MORE_LIKE_THIS DATA=-" %
        (userid, topic_id, topic_id, subtopic_id, corpus),
        verify=False)

    #mylog.log_nist_findmore(username, sys.argv[1], topic_name, sys.argv[2], subtopic_name+"::"+r.url+"::")

    docs = r.content.split('\n')
    for doc in docs:
        if doc:
            print doc.split()[0]
Example 11
def get_single_bilingual(key):
    """
    获取双语数据
    :param key: 双语词汇
    :return: 对应的双语数据
    """
    return DBHandler.get_single_bi_data(SQL.BI_SINGLE_TOPIC.format(key=key))
Example 12
def get_crowd_data():
    """
    获取 crowd data
    :return:
    """

    return DBHandler.get_crowd_data(SQL.UNCHECKED)
Example 13
def save_crowd_data(data):
    """
    存储 crowd data 数据到 uncheck 表中
    :param data: crowd data
    :return:
    """
    return DBHandler.save_crowd_data(SQL.SAVE_UNCHECKED.format(author=data.author, org=data.org,
                                                               key=data.key, journal=data.journal))
Example 14
def cookieAuthentication(env):
    user_db = DBHandler(db_path.user)
    result = None
    if 'HTTP_COOKIE' in env:
        for pair in env['HTTP_COOKIE'].split(';'):
            cookie = pair.strip()
            if cookie.startswith('usercookie'):
                key, value = cookie.split('=')
                user_db.cur.execute(
                    'SELECT userid, username, usercookie FROM user WHERE usercookie = ?',
                    [
                        value,
                    ])
                result = user_db.cur.fetchone()
                break
    user_db.close()
    return result
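For reference, cookieAuthentication expects a WSGI-style environ dict; a hypothetical call looks like this (the cookie value below is made up, and the return shape follows the SELECT in the function):

env = {'HTTP_COOKIE': 'sessionid=abc123; usercookie=5f2c9e'}
user = cookieAuthentication(env)  # -> (userid, username, usercookie) or None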
Example 15
class test_database(unittest.TestCase):
    def setUp(self):
        self.db = DBHandler()

    # Successful json: {'Food': {'FoodNrOne': 2, 'FoodNrTwo': 2, 'FoodNrThree': 2, 'FoodNrFour': 2}}
    def test_put_food_empty_list(self):
        json = []
        res = self.db.put_food(json)
        self.assertFalse(res)

    def test_put_drinks_empty_list(self):
        json = []
        res = self.db.put_drinks(json)
        self.assertFalse(res)

    def test_put_snacks_empty_list(self):
        json = []
        res = self.db.put_snacks(json)
        self.assertFalse(res)
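The "Successful json" comment above implies the payload shape for a non-empty request. A hedged happy-path test that could be added to test_database, assuming put_food returns a truthy value on success, would look like this:

    def test_put_food_valid_json(self):
        # Payload shape taken from the "Successful json" comment above; the
        # truthy return value on success is an assumption about DBHandler.put_food.
        json = {'Food': {'FoodNrOne': 2, 'FoodNrTwo': 2, 'FoodNrThree': 2, 'FoodNrFour': 2}}
        res = self.db.put_food(json)
        self.assertTrue(res)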
Example 16
def recoverSubtopic():

    subtopic_id = int(sys.argv[1])
    atn_db  = DBHandler('./database/test.db')
    atn_db.cur.execute('UPDATE subtopic SET state=0 WHERE subtopic_id=?', [subtopic_id])
    atn_db.cur.execute(
        '''
        UPDATE filter_list SET state=1 
        WHERE topic_id = (SELECT topic_id FROM subtopic WHERE subtopic_id=?)
        AND docno IN (
        SELECT DISTINCT passage.docno FROM passage
        WHERE passage.subtopic_id=?
        AND passage.state=0) AND state!=1
        ''',[subtopic_id, subtopic_id])
    atn_db.cur.execute(
        '''
        INSERT INTO filter_list (topic_id, docno, state)
        SELECT DISTINCT subtopic.topic_id, passage.docno, 1 FROM subtopic, passage
        WHERE subtopic.subtopic_id = passage.subtopic_id
        AND subtopic.subtopic_id=?
        AND passage.state = 0
        AND passage.docno NOT in (SELECT docno FROM filter_list WHERE topic_id = subtopic.topic_id); 
        ''', [subtopic_id])
    atn_db.commit()
    atn_db.close()
Example 17
def dupsummary():
    atn_db = DBHandler("./database/test.db")
    fh = open('./view/nonrelevant.csv','w')
    atn_db.cur.execute('''
        SELECT filter_list.topic_id, filter_list.docno FROM filter_list, topic 
        WHERE filter_list.topic_id=topic.topic_id
        AND topic.state!=2 
        AND topic.userid<=6
        AND filter_list.state=2
        ORDER BY filter_list.topic_id
        ''')
    dups = atn_db.cur.fetchall()
    for dup in dups:
        fh.write(str(dup[0])+','+dup[1]+'\n')
    fh.close()
Example 18
def do_search(keywords):

    global user_top_20_database

    # Fetch the current session
    request_session = request.environ["beaker.session"]
    # Fetch the users email for their session
    user_email = request_session.get("user_email", "Anonymous")

    if reduce(and_, map(lambda c: c in math_chars, keywords)):
        result = None
        try:
            result = eval(keywords.replace("^", "**").replace("[", "(").replace("]", ")"))
            return result_template(
                user_email,
                keywords,
                template(
                    """
				<p> {{keywords}} = {{result}} </p>
				""",
                    keywords=keywords,
                    result=result,
                ),
            )
        except Exception as e:
            pass

            # A list of all keywords from the search query.
    keyword_list = map(str.lower, keywords.split())
    keywords = keyword_list
    # -----------------------------------------------------------------------
    counted_keyword_list = [(keyword_list.count(x), x) for x in set(keyword_list)]
    # Sort the list in descending order of frequency.
    counted_keyword_list.sort(key=wordCount, reverse=1)

    page = request.query.get("page")
    if user_email <> "anonymous" and page == None:
        # Fetch the top 20 list for that users email
        user_top_20 = user_top_20_database.get(user_email)

        if user_top_20 != None:
            # Add to the top 20 list and update totals.
            # Iterate through the counted keyword list.
            for keywords1 in counted_keyword_list:
                # If any keywords are already in the top 20 list, merge them into the top 20 list.
                if any(keywords1[1] in element for element in user_top_20):
                    # Iterator to keep track of which keyword in the top 20 list we are at.
                    i = 0
                    # Iterate through the keyword pairs and add the values from the counted_keyword_list into the top20 list.
                    for keywords2 in user_top_20:
                        # If the keywords match.
                        if keywords2[1] == keywords1[1]:
                            # Save the count value of the user_top_20 version.
                            keyword_count = keywords2[0]
                            # Delete the old user_top_20 keyword and count.
                            del user_top_20[i]
                            # Add the keyword with updated count to the front of the top_20 list.
                            user_top_20.insert(0, ((keywords1[0] + keyword_count), keywords1[1]))
                            # Iterate
                        i = i + 1

                        # If the word isn't already in the top 20 list add it.
                else:
                    user_top_20.append(keywords1)

                    # Organize the top 20 list in descending order by the frequency of a keyword.
            user_top_20.sort(key=wordCount, reverse=1)

            # Update the database of user search history
            user_top_20_database["user_email"] = user_top_20

            # If the user_top_20 list is longer than 20 keywords, trim it.
            # while len(user_top_20) > 20:
            # 	del user_top_20[-1]

    # ------------------------------------------------------------------------

    # Grab the first keyword that was inputted by the user
    if keyword_list == []:
        results_list = []
        return generate_page_results(1, results_list, [], user_email)

    if page == None:
        page = 1
    else:
        page = int(page)

    db = DBHandler()

    # Get the word_ids through a getter in the database
    word_ids = []
    ignored_words = set(
        [
            "",
            "the",
            "of",
            "at",
            "on",
            "in",
            "is",
            "it",
            "a",
            "b",
            "c",
            "d",
            "e",
            "f",
            "g",
            "h",
            "i",
            "j",
            "k",
            "l",
            "m",
            "n",
            "o",
            "p",
            "q",
            "r",
            "s",
            "t",
            "u",
            "v",
            "w",
            "x",
            "y",
            "z",
            "and",
            "or",
        ]
    )

    for keyword in keyword_list:
        if keyword in ignored_words:
            continue
        word_ids.append(db.get_word_id(keyword))

        # Get the doc_ids from the word_ids in the database
    list_of_doc_id_lists = []
    for word_id in word_ids:
        if word_id == None:
            list_of_doc_id_lists.append([])
        else:
            list_of_doc_id_lists.append(db.get_doc_ids(word_id))

            # Find lists of doc_ids that intersect with each other, this will give us doc ids that contain both keywords
    intersecting_doc_ids = find_intersections(list_of_doc_id_lists)

    # Get the url_ranks from pagerank in the database
    ranks = db.get_pageranks(intersecting_doc_ids)

    # Zip the doc_ids with the corresponding url_ranks to make ranked_doc_ids
    ranked_doc_ids = zip(ranks, intersecting_doc_ids)

    # Sort the ranked_doc_ids to make sorted_doc_ids and get the sorted_urls from the database
    ranked_sorted_doc_ids = sorted(ranked_doc_ids, key=itemgetter(0))
    results_list = map(itemgetter(0), db.get_urls(map(itemgetter(1), ranked_sorted_doc_ids)))
    return generate_page_results(page, results_list, keyword_list, user_email)
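do_search calls a find_intersections helper that is not part of this listing. A minimal sketch consistent with the comment in the code (keep only doc ids that appear in every keyword's list) might be the following; the exact behaviour of the real helper is an assumption:

def find_intersections(list_of_doc_id_lists):
    # Intersect all posting lists; an empty input yields no results (assumed semantics).
    if not list_of_doc_id_lists:
        return []
    doc_id_sets = [set(doc_ids) for doc_ids in list_of_doc_id_lists]
    return list(set.intersection(*doc_id_sets))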
Example 19
def dupTopic():

    userid = 30
    topic_id = 391
    # copy this topic to this userid

    atn_db = DBHandler('./database/test.db')
    atn_db.insert('topic', [
        None, "slums and orphans _ debug", None, userid, 1, 'L', 'L', '', '', 0
    ])
    new_tid = atn_db.cur.lastrowid

    atn_db.cur.execute('SELECT * FROM subtopic WHERE topic_id=? AND state=0',
                       [topic_id])
    subtopics = atn_db.cur.fetchall()
    for subtopic in subtopics:
        atn_db.insert('subtopic',
                      [None, subtopic[1] + ' _ debug', new_tid, 0, 0])
        new_sid = atn_db.cur.lastrowid
        atn_db.cur.execute(
            'SELECT * FROM passage WHERE subtopic_id=? AND state=0',
            [subtopic[0]])
        passages = atn_db.cur.fetchall()
        for passage in passages:
            atn_db.insert(
                'passage',
                [None, passage[1], passage[2], 0, 0, passage[5], new_sid, 0])

    atn_db.cur.execute('SELECT docno, state FROM filter_list WHERE topic_id=?',
                       [topic_id])

    fdocs = atn_db.cur.fetchall()

    for fdoc in fdocs:
        docno, state = fdoc
        atn_db.insert('filter_list', [new_tid, docno, state])

    atn_db.commit()
    atn_db.close()
Example 20
import os

from flask import Flask
from flask_cors import CORS

from util import load_config
from database import DBHandler
from constants import TOPICS, COUNTRIES

here = os.path.dirname(os.path.abspath(__file__))
cfg = load_config()

app = Flask(__name__)
CORS(app, origins=cfg['access_control_allow_origin'])

mongo = DBHandler(
    host=cfg['database']['host'],
    port=cfg['database']['port'],
    db_name=cfg['database']['db_name'],
    collection_name=cfg['database']['collection_name'],
    es_host=cfg['es']['host'],
    es_port=cfg['es']['port'],
)


class InvalidUsage(Exception):

    status_code = 400

    def __init__(self, message, status_code=None, payload=None):
        Exception.__init__(self)
        self.message = message
        if status_code is not None:
            self.status_code = status_code
        self.payload = payload
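The snippet stops before showing how InvalidUsage is converted into an HTTP response. The standard Flask pattern this class appears to follow adds a to_dict helper and registers an error handler; the sketch below follows that pattern and is not code from this project (it also assumes jsonify is imported from flask).

    def to_dict(self):
        # Merge the optional payload with the error message (assumed helper).
        rv = dict(self.payload or ())
        rv['message'] = self.message
        return rv


@app.errorhandler(InvalidUsage)
def handle_invalid_usage(error):
    # Requires `from flask import jsonify`, which is not shown in the snippet above.
    response = jsonify(error.to_dict())
    response.status_code = error.status_code
    return response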
Example 21
        results = sorted([(v, k) for (k, v) in results.items()])

        # return our (limited) results
        return results[:limit]

    def chi2_distance(self, histA, histB, eps = 1e-10):
        # compute the chi-squared distance
        d = 0.5 * np.sum([((a - b) ** 2) / (a + b + eps) for (a, b) in zip(histA, histB)])
        # return the chi-squared distance
        return d

if __name__ == "__main__" :
    #system call from IndriSearchInterface::performSearch with 'python image_search.py'
    queried_img_path = "image_log" #cache queried downloaded images
    topicId = int(sys.argv[1])
    atn_db  = DBHandler("../../../database/Memex.db")
    cur = atn_db.cur
    atn_db.cur.execute("SELECT para from topic where topic_id=?",[topicId])
    para, = atn_db.cur.fetchone()
    query_img_url = para.split("&",3)[-1].lstrip("q=")
    if download_photo(query_img_url,"./","query"):
        # initialize the image descriptor
        cd = ColorDescriptor((8, 12, 3))
        # load the query image and describe it
        query = cv2.imread("query")
        features = cd.describe(query)
        # perform the search
        searcher = Searcher("a.csv")
        #start = datetime.now()
        results = searcher.search(features)
        #print((datetime.now()-start).seconds)
Example 22
def main():
	atn_db  = DBHandler("../../../database/Memex.db") #database connection
	topicId = int(sys.argv[1]) #topic id
	must_list = []
	should_list = []
	query_dic = getQuery(atn_db,topicId)
	age_min = 0
	age_max = 0
	height_min = 0
	height_max = 0
	query_body = {"size":500,"query":{"bool":{"must":[],"should":[]}} }
	feature_should_search_map = {"name":"name","hairColor":"hair","eyeColor":"eye","nationality":"nationality","ethnicity":"ethnicity","reviewSite":"review","reviewSiteId":"review","email":"email","phone":"phone","state":"","city":"","price":"","multiple_providers":"","socialMedia":"","socialMediaId":"","services":"","height":"height","weight":"weight","post_date":"posted"}
	for key in query_dic:
		if key in ["phone","age","height","hairColor","eyeColor"]: #field search
			pass
		else:
			must_list.append(query_dic[key])
	if "age" in query_dic:
		age_min = int(query_dic["age"][:2])
		age_max = int(query_dic["age"][2:])
		should_list.append("age")
	if "height" in query_dic:
		height_min = int(query_dic["height"][:3])
		height_max = int(query_dic["height"][3:])
		should_list.append("height")
	if must_list: #plain text search fields
		query_body["query"]["bool"]["must"].append({"match":{"raw_content":" ".join(must_list)}})
	else: #field search
		query_list = []
		if "age" in query_dic:
			query_list.append("age")
		if "height" in query_dic:
			query_list.append("height")
		query_body["query"]["bool"]["must"].append({"match":{"raw_content":" ".join(query_list)}})
	#should_arr = []
	# for word in should_list:
	# 	dic = {}
	# 	dic["match"] = {}
	# 	dic["match"]["raw_content"] = word
	# 	should_arr.append(dic)
	#query_body["query"]["bool"]["should"] = should_arr
	if "phone" in query_dic:
		phone_number = re.sub("\D","",query_dic["phone"])
		query_body["query"]["bool"]["must"].append({"match":{"phone":phone_number }})
	if "age" in query_dic:
		query_body["query"]["bool"]["must"].append({"range" : {"age" : {"gte" : age_min,"lte" : age_max}}})
	if "height" in query_dic:
		query_body["query"]["bool"]["must"].append({"range" : {"height" : {"gte" : height_min,"lte" : height_max}}})
	if "hairColor" in query_dic:
		query_body["query"]["bool"]["must"].append({"match":{"hairColor":" ".join(query_dic["hairColor"].split(","))}})
	if "eyeColor" in query_dic:
		query_body["query"]["bool"]["must"].append({"match":{"eyeColor":" ".join(query_dic["eyeColor"].split(","))}})
	raw_content_str = query_body["query"]["bool"]["must"][0]
	if not raw_content_str["match"]["raw_content"]: #occurs when field search(phone,hairColor,eyeColor) is the only field involved
		query_body["query"]["bool"]["must"].pop(0)
	a = open("test.txt","w")
	a.write(str(query_body))
	a.close()
	es = Elasticsearch(["localhost:9200/positiongt"],request_timeout=60)
	response = es.search(body=query_body,request_timeout=60)
	documents = response["hits"]["hits"]
	results = []
	if not documents:
		hypoFields = []
		if "hairColor" in query_dic:
			hypoFields.append("hairColor")
		if "eyeColor" in query_dic:
			hypoFields.append("eyeColor")
		is_raw_content = False
		if hypoFields: #if there is no results and hairColor or eyeColor included, transfer field search(originally hairColro and eyeColor are field search) to plain text search
			for term in hypoFields:
				j = -1
				for i in range(len(query_body["query"]["bool"]["must"])):
					if "raw_content" in query_body["query"]["bool"]["must"][i]["match"]:
						query_body["query"]["bool"]["must"][i]["match"]["raw_content"] += " "+" ".join(query_dic[term].split(","))
						is_raw_content = True
					if term in query_body["query"]["bool"]["must"][i]["match"]:
						j = i
				if j>=0:
					query_body["query"]["bool"]["must"].pop(j) #remove the field search
			if not is_raw_content: #this case occurs when field search are the only fields involved.
				query_body["query"]["bool"]["must"].insert(0,{"match":{"raw_content":" ".join(map(lambda x:" ".join(query_dic[x].split(",")),hypoFields))}})
			response = es.search(body=query_body,request_timeout=60)
			documents = response["hits"]["hits"]
	if "ethnicity" in query_dic:
		f = open("nation_continent.txt")
		ethnicity_dic = yaml.load(f)
		candidate_countries = ethnicity_dic[query_dic["ethnicity"].lower()]+[query_dic["ethnicity"].capitalize()]
		for document in documents:
			if "ethnicity" in document["_source"] and document["_source"]["ethnicity"]:
				ethnicities = map(lambda x:x.lower(),document["_source"]["ethnicity"])
				#print(ethnicities)
				if query_dic["ethnicity"].capitalize() in ethnicities:
					print(document["_id"])
					results.append(document["_id"])
				else:
					isMatch = False
					for eth_candi in ethnicities:
						if isMatch:
							break
						for coun_candi in candidate_countries:
							if fuzz.ratio(eth_candi,coun_candi.lower())>=80:
								print(document["_id"])
								results.append(document["_id"])
								isMatch = True
								break

	else:
		for document in documents:
			print document["_id"]
			results.append(document["_id"])
	atn_db.cur.execute("SELECT round from search_list where topic_id=? ORDER BY round DESC LIMIT 1",[topicId])
	res = atn_db.cur.fetchone()
	round = 0
	if res:
		round, = res
	round += 1
	for documentId in results:
		#print((None,topicId,round,documentId))
		atn_db.cur.execute('INSERT INTO %s VALUES(%s)' %("search_list", "?,?,?,?"), (None,topicId,round,documentId))
	atn_db.commit()
	atn_db.close()
Example 23
    def __init__(self, db_conn, url_file):
        """Initialize the crawler with a connection to the database to populate
        and with the file containing the list of seed URLs to begin indexing."""
        self._url_queue = []
        self._doc_id_cache = {}
        self._word_id_cache = {}

        self._url_list = {}
        self._word_list = {}
        self._inverted_index = {}
        self._resolved_inverted_index = {}
        self._link_list = []

        self._db = DBHandler()

        # functions to call when entering and exiting specific tags
        self._enter = defaultdict(lambda *a, **ka: self._visit_ignore)
        self._exit = defaultdict(lambda *a, **ka: self._visit_ignore)

        # add a link to our graph, and indexing info to the related page
        self._enter["a"] = self._visit_a

        # record the currently indexed document's title and increase
        # the font size
        def visit_title(*args, **kargs):
            self._visit_title(*args, **kargs)
            self._increase_font_factor(7)(*args, **kargs)

        # increase the font size when we enter these tags
        self._enter["b"] = self._increase_font_factor(2)
        self._enter["strong"] = self._increase_font_factor(2)
        self._enter["i"] = self._increase_font_factor(1)
        self._enter["em"] = self._increase_font_factor(1)
        self._enter["h1"] = self._increase_font_factor(7)
        self._enter["h2"] = self._increase_font_factor(6)
        self._enter["h3"] = self._increase_font_factor(5)
        self._enter["h4"] = self._increase_font_factor(4)
        self._enter["h5"] = self._increase_font_factor(3)
        self._enter["title"] = visit_title

        # decrease the font size when we exit these tags
        self._exit["b"] = self._increase_font_factor(-2)
        self._exit["strong"] = self._increase_font_factor(-2)
        self._exit["i"] = self._increase_font_factor(-1)
        self._exit["em"] = self._increase_font_factor(-1)
        self._exit["h1"] = self._increase_font_factor(-7)
        self._exit["h2"] = self._increase_font_factor(-6)
        self._exit["h3"] = self._increase_font_factor(-5)
        self._exit["h4"] = self._increase_font_factor(-4)
        self._exit["h5"] = self._increase_font_factor(-3)
        self._exit["title"] = self._increase_font_factor(-7)

        # never go in and parse these tags
        self._ignored_tags = set(
            [
                "meta",
                "script",
                "link",
                "meta",
                "embed",
                "iframe",
                "frame",
                "noscript",
                "object",
                "svg",
                "canvas",
                "applet",
                "frameset",
                "textarea",
                "style",
                "area",
                "map",
                "base",
                "basefont",
                "param",
            ]
        )

        # set of words to ignore
        self._ignored_words = set(
            [
                "",
                "the",
                "of",
                "at",
                "on",
                "in",
                "is",
                "it",
                "a",
                "b",
                "c",
                "d",
                "e",
                "f",
                "g",
                "h",
                "i",
                "j",
                "k",
                "l",
                "m",
                "n",
                "o",
                "p",
                "q",
                "r",
                "s",
                "t",
                "u",
                "v",
                "w",
                "x",
                "y",
                "z",
                "and",
                "or",
            ]
        )

        # TODO remove me in real version
        self._mock_next_doc_id = 1
        self._mock_next_word_id = 1

        # keep track of some info about the page we are currently parsing
        self._curr_depth = 0
        self._curr_url = ""
        self._curr_doc_id = 0
        self._font_size = 0
        self._curr_words = None

        # get all urls into the queue
        try:
            with open(url_file, "r") as f:
                for line in f:
                    self._url_queue.append((self._fix_url(line.strip(), ""), 0))
        except IOError:
            pass
Example 24
    def setUp(self):
        self.db = DBHandler()
Example 25
class crawler(object):
    """Represents 'Googlebot'. Populates a database by crawling and indexing
    a subset of the Internet.

    This crawler keeps track of font sizes and makes it simpler to manage word
    ids and document ids."""

    def __init__(self, db_conn, url_file):
        """Initialize the crawler with a connection to the database to populate
        and with the file containing the list of seed URLs to begin indexing."""
        self._url_queue = []
        self._doc_id_cache = {}
        self._word_id_cache = {}

        self._url_list = {}
        self._word_list = {}
        self._inverted_index = {}
        self._resolved_inverted_index = {}
        self._link_list = []

        self._db = DBHandler()

        # functions to call when entering and exiting specific tags
        self._enter = defaultdict(lambda *a, **ka: self._visit_ignore)
        self._exit = defaultdict(lambda *a, **ka: self._visit_ignore)

        # add a link to our graph, and indexing info to the related page
        self._enter["a"] = self._visit_a

        # record the currently indexed document's title and increase
        # the font size
        def visit_title(*args, **kargs):
            self._visit_title(*args, **kargs)
            self._increase_font_factor(7)(*args, **kargs)

        # increase the font size when we enter these tags
        self._enter["b"] = self._increase_font_factor(2)
        self._enter["strong"] = self._increase_font_factor(2)
        self._enter["i"] = self._increase_font_factor(1)
        self._enter["em"] = self._increase_font_factor(1)
        self._enter["h1"] = self._increase_font_factor(7)
        self._enter["h2"] = self._increase_font_factor(6)
        self._enter["h3"] = self._increase_font_factor(5)
        self._enter["h4"] = self._increase_font_factor(4)
        self._enter["h5"] = self._increase_font_factor(3)
        self._enter["title"] = visit_title

        # decrease the font size when we exit these tags
        self._exit["b"] = self._increase_font_factor(-2)
        self._exit["strong"] = self._increase_font_factor(-2)
        self._exit["i"] = self._increase_font_factor(-1)
        self._exit["em"] = self._increase_font_factor(-1)
        self._exit["h1"] = self._increase_font_factor(-7)
        self._exit["h2"] = self._increase_font_factor(-6)
        self._exit["h3"] = self._increase_font_factor(-5)
        self._exit["h4"] = self._increase_font_factor(-4)
        self._exit["h5"] = self._increase_font_factor(-3)
        self._exit["title"] = self._increase_font_factor(-7)

        # never go in and parse these tags
        self._ignored_tags = set(
            [
                "meta",
                "script",
                "link",
                "meta",
                "embed",
                "iframe",
                "frame",
                "noscript",
                "object",
                "svg",
                "canvas",
                "applet",
                "frameset",
                "textarea",
                "style",
                "area",
                "map",
                "base",
                "basefont",
                "param",
            ]
        )

        # set of words to ignore
        self._ignored_words = set(
            [
                "",
                "the",
                "of",
                "at",
                "on",
                "in",
                "is",
                "it",
                "a",
                "b",
                "c",
                "d",
                "e",
                "f",
                "g",
                "h",
                "i",
                "j",
                "k",
                "l",
                "m",
                "n",
                "o",
                "p",
                "q",
                "r",
                "s",
                "t",
                "u",
                "v",
                "w",
                "x",
                "y",
                "z",
                "and",
                "or",
            ]
        )

        # TODO remove me in real version
        self._mock_next_doc_id = 1
        self._mock_next_word_id = 1

        # keep track of some info about the page we are currently parsing
        self._curr_depth = 0
        self._curr_url = ""
        self._curr_doc_id = 0
        self._font_size = 0
        self._curr_words = None

        # get all urls into the queue
        try:
            with open(url_file, "r") as f:
                for line in f:
                    self._url_queue.append((self._fix_url(line.strip(), ""), 0))
        except IOError:
            pass

    # TODO remove me in real version
    def _mock_insert_document(self, uurl):
        """A function that pretends to insert a url into a document db table
        and then returns that newly inserted document's id."""
        ret_id = self._mock_next_doc_id
        url = uurl.encode("ascii")
        self._url_list[ret_id] = url
        self._mock_next_doc_id += 1
        return ret_id

    # TODO remove me in real version
    def _mock_insert_word(self, uword):
        """A function that pretends to inster a word into the lexicon db table
        and then returns that newly inserted word's id."""
        ret_id = self._mock_next_word_id
        word = uword.encode("ascii")
        self._word_list[ret_id] = word
        self._mock_next_word_id += 1
        return ret_id

    def _insert_pagerank(self):
        """Insert generated score for each page or link to database PageRank"""
        if len(self._link_list) > 0:
            _ranked_list = pagerank.page_rank(self._link_list)
            self._db.put_pageranks(_ranked_list)

    def word_id(self, word):
        """Get the word id of some specific word."""
        if word in self._word_id_cache:
            return self._word_id_cache[word]

        # TODO: 1) add the word to the lexicon, if that fails, then the
        #          word is in the lexicon
        #       2) query the lexicon for the id assigned to this word,
        #          store it in the word id cache, and return the id.

        word_id = self._mock_insert_word(word)
        self._word_id_cache[word] = word_id
        return word_id

    def document_id(self, url):
        """Get the document id for some url."""
        if url in self._doc_id_cache:
            return self._doc_id_cache[url]

        # TODO: just like word id cache, but for documents. if the document
        #       doesn't exist in the db then only insert the url and leave
        #       the rest to their defaults.

        doc_id = self._mock_insert_document(url)
        self._doc_id_cache[url] = doc_id
        return doc_id

    def _fix_url(self, curr_url, rel):
        """Given a url and either something relative to that url or another url,
        get a properly parsed url."""

        rel_l = rel.lower()
        if rel_l.startswith("http://") or rel_l.startswith("https://"):
            curr_url, rel = rel, ""

        # compute the new url from the current url and the relative component
        curr_url = urlparse.urldefrag(curr_url)[0]
        parsed_url = urlparse.urlparse(curr_url)
        return urlparse.urljoin(parsed_url.geturl(), rel)

    def add_link(self, from_doc_id, to_doc_id):
        """Add a link into the database, or increase the number of links between
        two pages in the database."""
        self._link_list.append((from_doc_id, to_doc_id))

    def _visit_title(self, elem):
        """Called when visiting the <title> tag."""
        title_text = self._text_of(elem).strip()
        print "document title=" + repr(title_text)

        # TODO update document title for document id self._curr_doc_id

    def _visit_a(self, elem):
        """Called when visiting <a> tags."""

        dest_url = self._fix_url(self._curr_url, attr(elem, "href"))

        # print "href="+repr(dest_url), \
        #      "title="+repr(attr(elem,"title")), \
        #      "alt="+repr(attr(elem,"alt")), \
        #      "text="+repr(self._text_of(elem))

        # add the just found URL to the url queue
        self._url_queue.append((dest_url, self._curr_depth))

        # add a link entry into the database from the current document to the
        # other document
        self.add_link(self._curr_doc_id, self.document_id(dest_url))

        # TODO add title/alt/text to index for destination url

    def _add_words_to_document(self):
        # TODO: knowing self._curr_doc_id and the list of all words and their
        #       font sizes (in self._curr_words), add all the words into the
        #       database for this document
        print "    num words=" + str(len(self._curr_words))

    def _increase_font_factor(self, factor):
        """Increade/decrease the current font size."""

        def increase_it(elem):
            self._font_size += factor

        return increase_it

    def _visit_ignore(self, elem):
        """Ignore visiting this type of tag"""
        pass

    def _add_text(self, elem):
        """Add some text to the document. This records word ids and word font sizes
        into the self._curr_words list for later processing."""
        words = WORD_SEPARATORS.split(elem.string.lower())
        for word in words:
            word = word.strip()
            if word in self._ignored_words:
                continue
            self._curr_words.append((self.word_id(word), self._font_size))

    def _text_of(self, elem):
        """Get the text inside some element without any tags."""
        if isinstance(elem, Tag):
            text = []
            for sub_elem in elem:
                text.append(self._text_of(sub_elem))

            return " ".join(text)
        else:
            return elem.string

    def _index_document(self, soup):
        """Traverse the document in depth-first order and call functions when entering
        and leaving tags. When we come across some text, add it into the index. This
        handles ignoring tags that we have no business looking at."""

        class DummyTag(object):
            next = False
            name = ""

        class NextTag(object):
            def __init__(self, obj):
                self.next = obj

        tag = soup.html
        stack = [DummyTag(), soup.html]

        while tag and tag.next:
            tag = tag.next

            # html tag
            if isinstance(tag, Tag):

                if tag.parent != stack[-1]:
                    self._exit[stack[-1].name.lower()](stack[-1])
                    stack.pop()

                tag_name = tag.name.lower()

                # ignore this tag and everything in it
                if tag_name in self._ignored_tags:
                    if tag.nextSibling:
                        tag = NextTag(tag.nextSibling)
                    else:
                        self._exit[stack[-1].name.lower()](stack[-1])
                        stack.pop()
                        tag = NextTag(tag.parent.nextSibling)

                    continue

                # enter the tag
                self._enter[tag_name](tag)
                stack.append(tag)

            # text (text, cdata, comments, etc.)
            else:
                self._add_text(tag)

    def crawl(self, depth=2, timeout=3):
        """Crawl the web!"""
        seen = set()

        while len(self._url_queue):

            url, depth_ = self._url_queue.pop()

            # skip this url; it's too deep
            if depth_ > depth:
                continue

            doc_id = self.document_id(url)

            # we've already seen this document
            if doc_id in seen:
                continue

            seen.add(doc_id)  # mark this document as visited

            socket = None
            try:
                socket = urllib2.urlopen(url, timeout=timeout)
                soup = BeautifulSoup(socket.read())

                self._curr_depth = depth_ + 1
                self._curr_url = url
                self._curr_doc_id = doc_id
                self._font_size = 0
                self._curr_words = []
                self._index_document(soup)
                self._add_words_to_document()
                self._get_inverted_index_helper()
                print "    url=" + repr(self._curr_url)

            except Exception as e:
                print e
                pass
            finally:
                if socket:
                    socket.close()

        self._db.put_lexicon(self._word_list)
        self._db.put_inverted_index(self._inverted_index)
        self._insert_pagerank()
        self._db.put_doc_index(self._url_list)
        # self._db.close()

    def _get_inverted_index_helper(self):
        """ Helper furction to store inverted index in a dict(). Word id used as the key, and 
        the list of document ids as the value where the list of document ids are stored in a 
        set(). """

        for key in self._curr_words:
            word_id = next(iter(key))

            if word_id not in self._inverted_index:
                self._inverted_index[word_id] = set([self._curr_doc_id])
            else:
                self._inverted_index[word_id].add(self._curr_doc_id)

    def get_inverted_index(self):
        return self._inverted_index

    def get_resolved_inverted_index(self):
        """ Word ids are replaced by the word strings, and the document Ids are replaced by
        URL strings in the inverted index. Return data in a dict(). """

        for word_id in self._inverted_index:
            urls = set([])
            for url_id in self._inverted_index[word_id]:
                urls.add(self._url_list[url_id])

            word = self._word_list[word_id]
            self._resolved_inverted_index[word] = urls
        return self._resolved_inverted_index
Example 26
    def __init__(self, db_conn, url_file):
        """Initialize the crawler with a connection to the database to populate
        and with the file containing the list of seed URLs to begin indexing."""
        self._url_queue = []
        self._doc_id_cache = {}
        self._word_id_cache = {}

        self._url_list = {}
        self._word_list = {}
        self._inverted_index = {}
        self._resolved_inverted_index = {}
        self._link_list = []

        self._db = DBHandler()

        # functions to call when entering and exiting specific tags
        self._enter = defaultdict(lambda *a, **ka: self._visit_ignore)
        self._exit = defaultdict(lambda *a, **ka: self._visit_ignore)

        # add a link to our graph, and indexing info to the related page
        self._enter['a'] = self._visit_a

        # record the currently indexed document's title and increase
        # the font size
        def visit_title(*args, **kargs):
            self._visit_title(*args, **kargs)
            self._increase_font_factor(7)(*args, **kargs)

        # increase the font size when we enter these tags
        self._enter['b'] = self._increase_font_factor(2)
        self._enter['strong'] = self._increase_font_factor(2)
        self._enter['i'] = self._increase_font_factor(1)
        self._enter['em'] = self._increase_font_factor(1)
        self._enter['h1'] = self._increase_font_factor(7)
        self._enter['h2'] = self._increase_font_factor(6)
        self._enter['h3'] = self._increase_font_factor(5)
        self._enter['h4'] = self._increase_font_factor(4)
        self._enter['h5'] = self._increase_font_factor(3)
        self._enter['title'] = visit_title

        # decrease the font size when we exit these tags
        self._exit['b'] = self._increase_font_factor(-2)
        self._exit['strong'] = self._increase_font_factor(-2)
        self._exit['i'] = self._increase_font_factor(-1)
        self._exit['em'] = self._increase_font_factor(-1)
        self._exit['h1'] = self._increase_font_factor(-7)
        self._exit['h2'] = self._increase_font_factor(-6)
        self._exit['h3'] = self._increase_font_factor(-5)
        self._exit['h4'] = self._increase_font_factor(-4)
        self._exit['h5'] = self._increase_font_factor(-3)
        self._exit['title'] = self._increase_font_factor(-7)

        # never go in and parse these tags
        self._ignored_tags = set([
            'meta',
            'script',
            'link',
            'meta',
            'embed',
            'iframe',
            'frame',
            'noscript',
            'object',
            'svg',
            'canvas',
            'applet',
            'frameset',
            'textarea',
            'style',
            'area',
            'map',
            'base',
            'basefont',
            'param',
        ])

        # set of words to ignore
        self._ignored_words = set([
            '',
            'the',
            'of',
            'at',
            'on',
            'in',
            'is',
            'it',
            'a',
            'b',
            'c',
            'd',
            'e',
            'f',
            'g',
            'h',
            'i',
            'j',
            'k',
            'l',
            'm',
            'n',
            'o',
            'p',
            'q',
            'r',
            's',
            't',
            'u',
            'v',
            'w',
            'x',
            'y',
            'z',
            'and',
            'or',
        ])

        # TODO remove me in real version
        self._mock_next_doc_id = 1
        self._mock_next_word_id = 1

        # keep track of some info about the page we are currently parsing
        self._curr_depth = 0
        self._curr_url = ""
        self._curr_doc_id = 0
        self._font_size = 0
        self._curr_words = None

        # get all urls into the queue
        try:
            with open(url_file, 'r') as f:
                for line in f:
                    self._url_queue.append((self._fix_url(line.strip(),
                                                          ""), 0))
        except IOError:
            pass
Example 27
class crawler(object):
    """Represents 'Googlebot'. Populates a database by crawling and indexing
    a subset of the Internet.

    This crawler keeps track of font sizes and makes it simpler to manage word
    ids and document ids."""
    def __init__(self, db_conn, url_file):
        """Initialize the crawler with a connection to the database to populate
        and with the file containing the list of seed URLs to begin indexing."""
        self._url_queue = []
        self._doc_id_cache = {}
        self._word_id_cache = {}

        self._url_list = {}
        self._word_list = {}
        self._inverted_index = {}
        self._resolved_inverted_index = {}
        self._link_list = []

        self._db = DBHandler()

        # functions to call when entering and exiting specific tags
        self._enter = defaultdict(lambda *a, **ka: self._visit_ignore)
        self._exit = defaultdict(lambda *a, **ka: self._visit_ignore)

        # add a link to our graph, and indexing info to the related page
        self._enter['a'] = self._visit_a

        # record the currently indexed document's title and increase
        # the font size
        def visit_title(*args, **kargs):
            self._visit_title(*args, **kargs)
            self._increase_font_factor(7)(*args, **kargs)

        # increase the font size when we enter these tags
        self._enter['b'] = self._increase_font_factor(2)
        self._enter['strong'] = self._increase_font_factor(2)
        self._enter['i'] = self._increase_font_factor(1)
        self._enter['em'] = self._increase_font_factor(1)
        self._enter['h1'] = self._increase_font_factor(7)
        self._enter['h2'] = self._increase_font_factor(6)
        self._enter['h3'] = self._increase_font_factor(5)
        self._enter['h4'] = self._increase_font_factor(4)
        self._enter['h5'] = self._increase_font_factor(3)
        self._enter['title'] = visit_title

        # decrease the font size when we exit these tags
        self._exit['b'] = self._increase_font_factor(-2)
        self._exit['strong'] = self._increase_font_factor(-2)
        self._exit['i'] = self._increase_font_factor(-1)
        self._exit['em'] = self._increase_font_factor(-1)
        self._exit['h1'] = self._increase_font_factor(-7)
        self._exit['h2'] = self._increase_font_factor(-6)
        self._exit['h3'] = self._increase_font_factor(-5)
        self._exit['h4'] = self._increase_font_factor(-4)
        self._exit['h5'] = self._increase_font_factor(-3)
        self._exit['title'] = self._increase_font_factor(-7)

        # never go in and parse these tags
        self._ignored_tags = set([
            'meta',
            'script',
            'link',
            'meta',
            'embed',
            'iframe',
            'frame',
            'noscript',
            'object',
            'svg',
            'canvas',
            'applet',
            'frameset',
            'textarea',
            'style',
            'area',
            'map',
            'base',
            'basefont',
            'param',
        ])

        # set of words to ignore
        self._ignored_words = set([
            '',
            'the',
            'of',
            'at',
            'on',
            'in',
            'is',
            'it',
            'a',
            'b',
            'c',
            'd',
            'e',
            'f',
            'g',
            'h',
            'i',
            'j',
            'k',
            'l',
            'm',
            'n',
            'o',
            'p',
            'q',
            'r',
            's',
            't',
            'u',
            'v',
            'w',
            'x',
            'y',
            'z',
            'and',
            'or',
        ])

        # TODO remove me in real version
        self._mock_next_doc_id = 1
        self._mock_next_word_id = 1

        # keep track of some info about the page we are currently parsing
        self._curr_depth = 0
        self._curr_url = ""
        self._curr_doc_id = 0
        self._font_size = 0
        self._curr_words = None

        # get all urls into the queue
        try:
            with open(url_file, 'r') as f:
                for line in f:
                    self._url_queue.append((self._fix_url(line.strip(),
                                                          ""), 0))
        except IOError:
            pass

    # TODO remove me in real version
    def _mock_insert_document(self, uurl):
        """A function that pretends to insert a url into a document db table
        and then returns that newly inserted document's id."""
        ret_id = self._mock_next_doc_id
        url = uurl.encode('ascii')
        self._url_list[ret_id] = url
        self._mock_next_doc_id += 1
        return ret_id

    # TODO remove me in real version
    def _mock_insert_word(self, uword):
        """A function that pretends to inster a word into the lexicon db table
        and then returns that newly inserted word's id."""
        ret_id = self._mock_next_word_id
        word = uword.encode('ascii')
        self._word_list[ret_id] = word
        self._mock_next_word_id += 1
        return ret_id

    def _insert_pagerank(self):
        """Insert generated score for each page or link to database PageRank"""
        if len(self._link_list) > 0:
            _ranked_list = pagerank.page_rank(self._link_list)
            self._db.put_pageranks(_ranked_list)

    def word_id(self, word):
        """Get the word id of some specific word."""
        if word in self._word_id_cache:
            return self._word_id_cache[word]

        # TODO: 1) add the word to the lexicon, if that fails, then the
        #          word is in the lexicon
        #       2) query the lexicon for the id assigned to this word,
        #          store it in the word id cache, and return the id.

        word_id = self._mock_insert_word(word)
        self._word_id_cache[word] = word_id
        return word_id

    def document_id(self, url):
        """Get the document id for some url."""
        if url in self._doc_id_cache:
            return self._doc_id_cache[url]

        # TODO: just like word id cache, but for documents. if the document
        #       doesn't exist in the db then only insert the url and leave
        #       the rest to their defaults.

        doc_id = self._mock_insert_document(url)
        self._doc_id_cache[url] = doc_id
        return doc_id

    def _fix_url(self, curr_url, rel):
        """Given a url and either something relative to that url or another url,
        get a properly parsed url."""

        rel_l = rel.lower()
        if rel_l.startswith("http://") or rel_l.startswith("https://"):
            curr_url, rel = rel, ""

        # compute the new url from the current url and the relative component
        curr_url = urlparse.urldefrag(curr_url)[0]
        parsed_url = urlparse.urlparse(curr_url)
        return urlparse.urljoin(parsed_url.geturl(), rel)

    def add_link(self, from_doc_id, to_doc_id):
        """Add a link into the database, or increase the number of links between
        two pages in the database."""
        self._link_list.append((from_doc_id, to_doc_id))

    def _visit_title(self, elem):
        """Called when visiting the <title> tag."""
        title_text = self._text_of(elem).strip()
        print "document title=" + repr(title_text)

        # TODO update document title for document id self._curr_doc_id

    def _visit_a(self, elem):
        """Called when visiting <a> tags."""

        dest_url = self._fix_url(self._curr_url, attr(elem, "href"))

        #print "href="+repr(dest_url), \
        #      "title="+repr(attr(elem,"title")), \
        #      "alt="+repr(attr(elem,"alt")), \
        #      "text="+repr(self._text_of(elem))

        # add the just found URL to the url queue
        self._url_queue.append((dest_url, self._curr_depth))

        # add a link entry into the database from the current document to the
        # other document
        self.add_link(self._curr_doc_id, self.document_id(dest_url))

        # TODO add title/alt/text to index for destination url

    def _add_words_to_document(self):
        # TODO: knowing self._curr_doc_id and the list of all words and their
        #       font sizes (in self._curr_words), add all the words into the
        #       database for this document
        print "    num words=" + str(len(self._curr_words))

    def _increase_font_factor(self, factor):
        """Increade/decrease the current font size."""
        def increase_it(elem):
            self._font_size += factor

        return increase_it

    def _visit_ignore(self, elem):
        """Ignore visiting this type of tag"""
        pass

    def _add_text(self, elem):
        """Add some text to the document. This records word ids and word font sizes
        into the self._curr_words list for later processing."""
        words = WORD_SEPARATORS.split(elem.string.lower())
        for word in words:
            word = word.strip()
            if word in self._ignored_words:
                continue
            self._curr_words.append((self.word_id(word), self._font_size))

    def _text_of(self, elem):
        """Get the text inside some element without any tags."""
        if isinstance(elem, Tag):
            text = []
            for sub_elem in elem:
                text.append(self._text_of(sub_elem))

            return " ".join(text)
        else:
            return elem.string

    def _index_document(self, soup):
        """Traverse the document in depth-first order and call functions when entering
        and leaving tags. When we come across some text, add it into the index. This
        handles ignoring tags that we have no business looking at."""
        class DummyTag(object):
            next = False
            name = ''

        class NextTag(object):
            def __init__(self, obj):
                self.next = obj

        tag = soup.html
        stack = [DummyTag(), soup.html]

        while tag and tag.next:
            tag = tag.next

            # html tag
            if isinstance(tag, Tag):

                if tag.parent != stack[-1]:
                    self._exit[stack[-1].name.lower()](stack[-1])
                    stack.pop()

                tag_name = tag.name.lower()

                # ignore this tag and everything in it
                if tag_name in self._ignored_tags:
                    if tag.nextSibling:
                        tag = NextTag(tag.nextSibling)
                    else:
                        self._exit[stack[-1].name.lower()](stack[-1])
                        stack.pop()
                        tag = NextTag(tag.parent.nextSibling)

                    continue

                # enter the tag
                self._enter[tag_name](tag)
                stack.append(tag)

            # text (text, cdata, comments, etc.)
            else:
                self._add_text(tag)

    def crawl(self, depth=2, timeout=3):
        """Crawl the web!"""
        seen = set()

        while len(self._url_queue):

            url, depth_ = self._url_queue.pop()

            # skip this url; it's too deep
            if depth_ > depth:
                continue

            doc_id = self.document_id(url)

            # we've already seen this document
            if doc_id in seen:
                continue

            seen.add(doc_id)  # mark this document as visited

            socket = None
            try:
                socket = urllib2.urlopen(url, timeout=timeout)
                soup = BeautifulSoup(socket.read())

                self._curr_depth = depth_ + 1
                self._curr_url = url
                self._curr_doc_id = doc_id
                self._font_size = 0
                self._curr_words = []
                self._index_document(soup)
                self._add_words_to_document()
                self._get_inverted_index_helper()
                print "    url=" + repr(self._curr_url)

            except Exception as e:
                print e
                pass
            finally:
                if socket:
                    socket.close()

        self._db.put_lexicon(self._word_list)
        self._db.put_inverted_index(self._inverted_index)
        self._insert_pagerank()
        self._db.put_doc_index(self._url_list)
        # self._db.close()

    def _get_inverted_index_helper(self):
        """ Helper furction to store inverted index in a dict(). Word id used as the key, and 
        the list of document ids as the value where the list of document ids are stored in a 
        set(). """

        for key in self._curr_words:
            word_id = next(iter(key))

            if word_id not in self._inverted_index:
                self._inverted_index[word_id] = set([self._curr_doc_id])
            else:
                self._inverted_index[word_id].add(self._curr_doc_id)

    def get_inverted_index(self):
        return self._inverted_index

    def get_resolved_inverted_index(self):
        """ Word ids are replaced by the word strings, and the document Ids are replaced by
        URL strings in the inverted index. Return data in a dict(). """

        for word_id in self._inverted_index:
            urls = set([])
            for url_id in self._inverted_index[word_id]:
                urls.add(self._url_list[url_id])

            word = self._word_list[word_id]
            self._resolved_inverted_index[word] = urls
        return self._resolved_inverted_index
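
# A minimal usage sketch. Hedged: the crawler class name, the database handle, and
# the seed-URL file are assumptions, since none of them is shown in this excerpt.
#
#     bot = crawler(db_conn, 'urls.txt')
#     bot.crawl(depth=1, timeout=3)
#     print bot.get_resolved_inverted_index()   # {word: set([url, ...]), ...}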
Exemplo n.º 28
0
def do_search(keywords):

    global user_top_20_database

    # Fetch the current session
    request_session = request.environ['beaker.session']
    # Fetch the user's email for their session
    user_email = request_session.get('user_email', 'Anonymous')

    # Treat the query as a calculator expression when every character is a math
    # character (guard against an empty query, which would make reduce() raise).
    if keywords and reduce(and_, map(lambda c: c in math_chars, keywords)):
        result = None
        try:
            result = eval(
                keywords.replace('^', '**').replace('[', '(').replace(']', ')'))
            return result_template(
                user_email, keywords,
                template('''
				<p> {{keywords}} = {{result}} </p>
				''',
                         keywords=keywords,
                         result=result))
        except Exception as e:
            pass
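
    # For example (assuming math_chars, which is not shown in this excerpt, admits
    # digits, spaces and these operator characters), the query "2^3 + [4]*5" is
    # rewritten to "2**3 + (4)*5" and evaluates to 28.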

    # A list of all keywords from the search query.
    keyword_list = map(str.lower, keywords.split())
    keywords = keyword_list
    #-----------------------------------------------------------------------
    counted_keyword_list = [(keyword_list.count(x), x)
                            for x in set(keyword_list)]
    # Sort the list in descending order of frequency.
    counted_keyword_list.sort(key=wordCount, reverse=1)
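
    # For example, the query "apple pie apple" yields
    # [(2, 'apple'), (1, 'pie')] after the descending-frequency sort above.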

    page = request.query.get('page')
    if user_email != 'Anonymous' and page is None:
        # Fetch the top 20 list for that users email
        user_top_20 = user_top_20_database.get(user_email)

        if user_top_20 is not None:
            # Add to the top 20 list and update totals.
            # Iterate through the counted keyword list.
            for keywords1 in counted_keyword_list:
                # If any keywords are already in the top 20 list, merge them into the top 20 list.
                if any(keywords1[1] in element for element in user_top_20):
                    # Find the keyword pair with the same word, merge the counts,
                    # and move the merged entry to the front of the top 20 list.
                    for i, keywords2 in enumerate(user_top_20):
                        if keywords2[1] == keywords1[1]:
                            # Save the count value of the user_top_20 version.
                            keyword_count = keywords2[0]
                            # Replace the old entry with one holding the updated count.
                            del user_top_20[i]
                            user_top_20.insert(
                                0,
                                ((keywords1[0] + keyword_count), keywords1[1]))
                            # Each keyword appears at most once, so stop scanning.
                            break

                # If the word isn't already in the top 20 list add it.
                else:
                    user_top_20.append(keywords1)

            # Organize the top 20 list in descending order by the frequency of a keyword.
            user_top_20.sort(key=wordCount, reverse=1)

            # Update this user's search history in the database.
            user_top_20_database[user_email] = user_top_20

            # If the user_top_20 list is longer than 20 keywords, trim it.
            # while len(user_top_20) > 20:
            #	del user_top_20[-1]


    #------------------------------------------------------------------------

    # If no keywords were entered, return an empty results page.
    if not keyword_list:
        results_list = []
        return generate_page_results(1, results_list, [], user_email)

    if page is None:
        page = 1
    else:
        page = int(page)

    db = DBHandler()

    # Get the word_ids through a getter in the database
    word_ids = []
    ignored_words = set([
        '', 'the', 'of', 'at', 'on', 'in', 'is', 'it',
        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
        'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
        'and', 'or',
    ])

    for keyword in keyword_list:
        if keyword in ignored_words:
            continue
        word_ids.append(db.get_word_id(keyword))

    # Get the doc_ids from the word_ids in the database
    list_of_doc_id_lists = []
    for word_id in word_ids:
        if word_id is None:
            list_of_doc_id_lists.append([])
        else:
            list_of_doc_id_lists.append(db.get_doc_ids(word_id))

    # Intersect the per-keyword doc_id lists so that only documents containing every
    # keyword remain (see the sketch of find_intersections after this function).
    intersecting_doc_ids = find_intersections(list_of_doc_id_lists)

    # Get the url_ranks from pagerank in the database
    ranks = db.get_pageranks(intersecting_doc_ids)

    # Zip the doc_ids with the corresponding url_ranks to make ranked_doc_ids
    ranked_doc_ids = zip(ranks, intersecting_doc_ids)

    # Sort the ranked_doc_ids by pagerank, highest rank first, and fetch the sorted_urls from the database
    ranked_sorted_doc_ids = sorted(ranked_doc_ids, key=itemgetter(0), reverse=True)
    results_list = map(itemgetter(0),
                       db.get_urls(map(itemgetter(1), ranked_sorted_doc_ids)))
    return generate_page_results(page, results_list, keyword_list, user_email)
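
# Hedged sketch of the find_intersections helper called above; its real
# implementation is not shown in this excerpt, but it presumably keeps only the
# doc ids common to every per-keyword list:
#
#     def find_intersections(list_of_doc_id_lists):
#         if not list_of_doc_id_lists:
#             return []
#         common = set(list_of_doc_id_lists[0])
#         for doc_ids in list_of_doc_id_lists[1:]:
#             common &= set(doc_ids)
#         return list(common)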
Exemplo n.º 29
0
#from flask.ext.restful import Api, Resource, reqparse
from flask_restful import Api, Resource, reqparse
from flask_restful.utils import cors
#from flask.ext.restful.utils import cors
#from flask.ext.cors import CORS
from flask_cors import CORS

from modules.ssh import QoSHandler

# json and Flask are used below; DBHandler is assumed to be imported from the
# project's own database module earlier in the original file (not shown here).
import json
from flask import Flask

app = Flask(__name__)
CORS(app)
api = Api(app)

config = json.load(open('./config.json', 'r'))

db_handler = DBHandler(config)


class UserAPI(Resource):
    def __init__(self):
        self.reqparse = reqparse.RequestParser()

        super(UserAPI, self).__init__()

    @cors.crossdomain(origin='*')
    def get(self, uid):
        users = []
        if uid != 'all':
            users = uid.split(',')

        resultset = db_handler.get_users(users)