def recoverSubtopic():
    subtopic_id = int(sys.argv[1])
    atn_db = DBHandler('./database/Memex.db')
    atn_db.cur.execute('UPDATE subtopic SET state=0 WHERE subtopic_id=?', [subtopic_id])
    atn_db.cur.execute('''
        UPDATE filter_list SET state=1
        WHERE topic_id = (SELECT topic_id FROM subtopic WHERE subtopic_id=?)
        AND docno IN (
            SELECT DISTINCT passage.docno FROM passage
            WHERE passage.subtopic_id=? AND passage.state=0)
        AND state!=1
        ''', [subtopic_id, subtopic_id])
    atn_db.cur.execute('''
        INSERT INTO filter_list (topic_id, docno, state)
        SELECT DISTINCT subtopic.topic_id, passage.docno, 1
        FROM subtopic, passage
        WHERE subtopic.subtopic_id = passage.subtopic_id
        AND subtopic.subtopic_id=? AND passage.state = 0
        AND passage.docno NOT IN (SELECT docno FROM filter_list WHERE topic_id = subtopic.topic_id);
        ''', [subtopic_id])
    atn_db.commit()
    atn_db.close()
def userAuthentication(username, password):
    user_db = DBHandler(db_path.user)
    result = None
    user_db.cur.execute(
        'SELECT userid, username, usercookie FROM user WHERE username = ? AND password = ?',
        [username, password])
    result = user_db.cur.fetchone()
    user_db.close()
    return result
def cookieAuthentication(env):
    user_db = DBHandler(db_path.user)
    result = None
    if 'HTTP_COOKIE' in env:
        for pair in env['HTTP_COOKIE'].split(';'):
            cookie = pair.strip()
            if cookie.startswith('usercookie'):
                key, value = cookie.split('=')
                user_db.cur.execute(
                    'SELECT userid, username, usercookie FROM user WHERE usercookie = ?',
                    [value])
                result = user_db.cur.fetchone()
                break
    user_db.close()
    return result
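# A minimal usage sketch for the two authentication helpers above, written as a
# hypothetical WSGI-style handler. The (userid, username, usercookie) row shape
# comes from the SELECT statements; the handler itself is illustrative, not part
# of the original project.
def identify_user(env, form):
    user = cookieAuthentication(env)           # try the usercookie first
    if user is None and 'username' in form and 'password' in form:
        user = userAuthentication(form['username'], form['password'])
    if user is None:
        return None                             # anonymous request
    userid, username, usercookie = user
    return username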
def get_paper(topic):
    """
    Fetch the "more papers" data for a topic.
    :param topic: topic keyword
    :return: paper data
    """
    return DBHandler.get_paper_data(SQL.MORE_PAPER.format(topic=topic))


def get_bilingual(topic):
    """
    Fetch the "more" bilingual data for a topic.
    :param topic: keyword
    :return: list of bilingual entries
    """
    return DBHandler.get_bi_data(SQL.BI_TOPIC.format(topic=topic))


def get_lower_concept(topic):
    """
    Fetch the narrower (lower-level) concepts for a topic.
    :param topic: keyword
    :return: lower-level concepts as OntologyRelation
    """
    return DBHandler.get_relation(SQL.RELATION.format(topic=topic), '2')


def get_topic(topic):
    """
    Fetch paper information by topic.
    :param topic: paper keyword
    :return: paper information, if any exists
    """
    return DBHandler.get_paper_data(SQL.PAPER.format(topic=topic))


def get_researcher(name):
    """
    Fetch paper information by researcher.
    :param name: researcher name
    :return: paper information, if any exists
    """
    return DBHandler.get_paper_data(SQL.RESEARCHER.format(name=name))
def getDocList1():
    topic_id, subtopic_id = int(sys.argv[1]), int(sys.argv[2])
    atn_db = DBHandler('../../../database/test.db')
    atn_db.cur.execute(
        'SELECT userid, domain_id, topic_name FROM topic WHERE topic_id=?', [topic_id])
    userid, domain_id, topic_name = atn_db.cur.fetchone()
    atn_db.cur.execute('SELECT username FROM user WHERE userid=?', [userid])
    username, = atn_db.cur.fetchone()
    atn_db.cur.execute(
        'SELECT subtopic_name FROM subtopic WHERE subtopic_id=?', [subtopic_id])
    subtopic_name, = atn_db.cur.fetchone()
    corpus = ['EBOLA', 'POLAR', 'WEAPON'][domain_id - 1]
    r = requests.get(
        nistURL + "CMD=UID=%d TID=%d STID=%d.%d CO=%s CMD=MORE_LIKE_THIS DATA=-"
        % (userid, topic_id, topic_id, subtopic_id, corpus),
        verify=False)
    #mylog.log_nist_findmore(username, sys.argv[1], topic_name, sys.argv[2], subtopic_name+"::"+r.url+"::")
    docs = r.content.split('\n')
    for doc in docs:
        if doc:
            print doc.split()[0]
def get_single_bilingual(key):
    """
    Fetch bilingual data for a single term.
    :param key: bilingual term
    :return: the matching bilingual data
    """
    return DBHandler.get_single_bi_data(SQL.BI_SINGLE_TOPIC.format(key=key))


def get_crowd_data():
    """
    Fetch crowd data that has not been checked yet.
    :return: unchecked crowd data
    """
    return DBHandler.get_crowd_data(SQL.UNCHECKED)


def save_crowd_data(data):
    """
    Save crowd data into the uncheck table.
    :param data: crowd data
    :return:
    """
    return DBHandler.save_crowd_data(SQL.SAVE_UNCHECKED.format(
        author=data.author, org=data.org, key=data.key, journal=data.journal))
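# The get_*/save_crowd_data wrappers above interpolate values into SQL with
# str.format(). A minimal sketch of a parameterized alternative, assuming
# DBHandler exposes a cursor the way the other snippets in this collection do;
# the SQL text, table and column names here are hypothetical.
def get_topic_parameterized(db, topic):
    db.cur.execute(
        'SELECT * FROM paper WHERE topic = ?',   # placeholder instead of format()
        [topic])
    return db.cur.fetchall()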
class test_database(unittest.TestCase):
    def setUp(self):
        self.db = DBHandler()

    # Successful json: {'Food': {'FoodNrOne': 2, 'FoodNrTwo': 2, 'FoodNrThree': 2, 'FoodNrFour': 2}}
    def test_put_food_empty_list(self):
        json = []
        res = self.db.put_food(json)
        self.assertFalse(res)

    def test_put_drinks_empty_list(self):
        json = []
        res = self.db.put_drinks(json)
        self.assertFalse(res)

    def test_put_snacks_empty_list(self):
        json = []
        res = self.db.put_snacks(json)
        self.assertFalse(res)
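# A sketch of the complementary "non-empty payload" case, using the JSON shape
# from the comment above. It assumes put_food() returns a truthy value on
# success, which the empty-list tests only imply, not show.
class test_database_put(unittest.TestCase):
    def setUp(self):
        self.db = DBHandler()

    def test_put_food_valid_payload(self):
        json = {'Food': {'FoodNrOne': 2, 'FoodNrTwo': 2,
                         'FoodNrThree': 2, 'FoodNrFour': 2}}
        res = self.db.put_food(json)
        self.assertTrue(res)


if __name__ == '__main__':
    unittest.main()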
def dupsummary():
    atn_db = DBHandler("./database/test.db")
    fh = open('./view/nonrelevant.csv', 'w')
    atn_db.cur.execute('''
        SELECT filter_list.topic_id, filter_list.docno
        FROM filter_list, topic
        WHERE filter_list.topic_id=topic.topic_id
        AND topic.state!=2 AND topic.userid<=6 AND filter_list.state=2
        ORDER BY filter_list.topic_id
        ''')
    dups = atn_db.cur.fetchall()
    for dup in dups:
        fh.write(str(dup[0]) + ',' + dup[1] + '\n')
    fh.close()
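# The same export sketched with the csv module, which takes care of quoting
# should a docno ever contain a comma. Path and schema are as in dupsummary()
# above; 'wb' mode because these snippets use Python 2 elsewhere.
import csv

def dupsummary_csv():
    atn_db = DBHandler("./database/test.db")
    atn_db.cur.execute('''
        SELECT filter_list.topic_id, filter_list.docno
        FROM filter_list, topic
        WHERE filter_list.topic_id=topic.topic_id
        AND topic.state!=2 AND topic.userid<=6 AND filter_list.state=2
        ORDER BY filter_list.topic_id
        ''')
    with open('./view/nonrelevant.csv', 'wb') as fh:
        writer = csv.writer(fh)
        for topic_id, docno in atn_db.cur.fetchall():
            writer.writerow([topic_id, docno])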
def do_search(keywords):
    global user_top_20_database

    # Fetch the current session
    request_session = request.environ["beaker.session"]
    # Fetch the user's email for their session
    user_email = request_session.get("user_email", "Anonymous")

    # If the query is made up entirely of math characters, treat it as a calculation.
    if reduce(and_, map(lambda c: c in math_chars, keywords)):
        result = None
        try:
            result = eval(keywords.replace("^", "**").replace("[", "(").replace("]", ")"))
            return result_template(
                user_email,
                keywords,
                template(
                    """
                    <p>
                    {{keywords}} = {{result}}
                    </p>
                    """,
                    keywords=keywords,
                    result=result,
                ),
            )
        except Exception as e:
            pass

    # A list of all keywords from the search query.
    keyword_list = map(str.lower, keywords.split())
    keywords = keyword_list
    # -----------------------------------------------------------------------
    counted_keyword_list = [(keyword_list.count(x), x) for x in set(keyword_list)]
    # Sort the list in descending order of frequency.
    counted_keyword_list.sort(key=wordCount, reverse=1)

    page = request.query.get("page")
    if user_email != "Anonymous" and page is None:
        # Fetch the top 20 list for that user's email
        user_top_20 = user_top_20_database.get(user_email)
        if user_top_20 is not None:
            # Add to the top 20 list and update totals.
            # Iterate through the counted keyword list.
            for keywords1 in counted_keyword_list:
                # If any keywords are already in the top 20 list, merge them into the top 20 list.
                if any(keywords1[1] in element for element in user_top_20):
                    # Iterator to keep track of which keyword in the top 20 list we are at.
                    i = 0
                    # Iterate through the keyword pairs and add the values from the
                    # counted_keyword_list into the top 20 list.
                    for keywords2 in user_top_20:
                        # If the keywords match.
                        if keywords2[1] == keywords1[1]:
                            # Save the count value of the user_top_20 version.
                            keyword_count = keywords2[0]
                            # Delete the old user_top_20 keyword and count.
                            del user_top_20[i]
                            # Add the keyword with updated count to the front of the top_20 list.
                            user_top_20.insert(0, (keywords1[0] + keyword_count, keywords1[1]))
                        # Iterate
                        i = i + 1
                # If the word isn't already in the top 20 list, add it.
                else:
                    user_top_20.append(keywords1)
            # Organize the top 20 list in descending order by the frequency of a keyword.
            user_top_20.sort(key=wordCount, reverse=1)
            # Update the database of user search history
            user_top_20_database[user_email] = user_top_20
            # If the user_top_20 list is longer than 20 keywords, trim it.
            # while len(user_top_20) > 20:
            #     del user_top_20[-1]
    # ------------------------------------------------------------------------

    # Grab the first keyword that was inputted by the user
    if keyword_list == []:
        results_list = []
        return generate_page_results(1, results_list, [], user_email)

    if page is None:
        page = 1
    else:
        page = int(page)

    db = DBHandler()

    # Get the word_ids through a getter in the database
    word_ids = []
    ignored_words = set([
        "", "the", "of", "at", "on", "in", "is", "it",
        "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
        "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
        "and", "or",
    ])
    for keyword in keyword_list:
        if keyword in ignored_words:
            continue
        word_ids.append(db.get_word_id(keyword))

    # Get the doc_ids from the word_ids in the database
    list_of_doc_id_lists = []
    for word_id in word_ids:
        if word_id is None:
            list_of_doc_id_lists.append([])
        else:
            list_of_doc_id_lists.append(db.get_doc_ids(word_id))

    # Find lists of doc_ids that intersect with each other; this gives us doc ids
    # that contain all of the keywords
    intersecting_doc_ids = find_intersections(list_of_doc_id_lists)

    # Get the url_ranks from pagerank in the database
    ranks = db.get_pageranks(intersecting_doc_ids)

    # Zip the doc_ids with the corresponding url_ranks to make ranked_doc_ids
    ranked_doc_ids = zip(ranks, intersecting_doc_ids)

    # Sort the ranked_doc_ids and get the sorted urls from the database
    ranked_sorted_doc_ids = sorted(ranked_doc_ids, key=itemgetter(0))
    results_list = map(itemgetter(0), db.get_urls(map(itemgetter(1), ranked_sorted_doc_ids)))

    return generate_page_results(page, results_list, keyword_list, user_email)
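# do_search() relies on a find_intersections() helper that is not shown in this
# snippet. A minimal sketch of what such a helper might look like, assuming it
# should keep only the doc_ids present in every per-keyword list; this is an
# assumption, not the project's actual implementation.
def find_intersections(list_of_doc_id_lists):
    if not list_of_doc_id_lists:
        return []
    common = set(list_of_doc_id_lists[0])
    for doc_ids in list_of_doc_id_lists[1:]:
        common &= set(doc_ids)
    return list(common)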
def dupTopic():
    userid = 30
    topic_id = 391  # copy this topic to this userid
    atn_db = DBHandler('./database/test.db')
    atn_db.insert('topic', [
        None, "slums and orphans _ debug", None, userid, 1, 'L', 'L', '', '', 0
    ])
    new_tid = atn_db.cur.lastrowid
    atn_db.cur.execute('SELECT * FROM subtopic WHERE topic_id=? AND state=0', [topic_id])
    subtopics = atn_db.cur.fetchall()
    for subtopic in subtopics:
        atn_db.insert('subtopic', [None, subtopic[1] + ' _ debug', new_tid, 0, 0])
        new_sid = atn_db.cur.lastrowid
        atn_db.cur.execute(
            'SELECT * FROM passage WHERE subtopic_id=? AND state=0', [subtopic[0]])
        passages = atn_db.cur.fetchall()
        for passage in passages:
            atn_db.insert(
                'passage',
                [None, passage[1], passage[2], 0, 0, passage[5], new_sid, 0])
    atn_db.cur.execute('SELECT docno, state FROM filter_list WHERE topic_id=?', [topic_id])
    fdocs = atn_db.cur.fetchall()
    for fdoc in fdocs:
        docno, state = fdoc
        atn_db.insert('filter_list', [new_tid, docno, state])
    atn_db.commit()
    atn_db.close()
import os

from flask import Flask
from flask_cors import CORS

from util import load_config
from database import DBHandler
from constants import TOPICS, COUNTRIES

here = os.path.dirname(os.path.abspath(__file__))
cfg = load_config()

app = Flask(__name__)
CORS(app, origins=cfg['access_control_allow_origin'])
mongo = DBHandler(
    host=cfg['database']['host'],
    port=cfg['database']['port'],
    db_name=cfg['database']['db_name'],
    collection_name=cfg['database']['collection_name'],
    es_host=cfg['es']['host'],
    es_port=cfg['es']['port'],
)


class InvalidUsage(Exception):
    status_code = 400

    def __init__(self, message, status_code=None, payload=None):
        Exception.__init__(self)
        self.message = message
        if status_code is not None:
            self.status_code = status_code
        self.payload = payload
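# InvalidUsage is only useful once Flask knows how to render it. A minimal
# sketch of the usual jsonify-based error-handler registration; this handler is
# assumed, it is not part of the snippet above.
from flask import jsonify

@app.errorhandler(InvalidUsage)
def handle_invalid_usage(error):
    response = jsonify({'message': error.message})
    response.status_code = error.status_code
    return response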
        results = sorted([(v, k) for (k, v) in results.items()])

        # return our (limited) results
        return results[:limit]

    def chi2_distance(self, histA, histB, eps=1e-10):
        # compute the chi-squared distance
        d = 0.5 * np.sum([((a - b) ** 2) / (a + b + eps)
                          for (a, b) in zip(histA, histB)])

        # return the chi-squared distance
        return d


if __name__ == "__main__":
    # system call from IndriSearchInterface::performSearch with 'python image_search.py'
    queried_img_path = "image_log"  # cache queried downloaded images
    topicId = int(sys.argv[1])
    atn_db = DBHandler("../../../database/Memex.db")
    cur = atn_db.cur
    atn_db.cur.execute("SELECT para from topic where topic_id=?", [topicId])
    para, = atn_db.cur.fetchone()
    query_img_url = para.split("&", 3)[-1].lstrip("q=")
    if download_photo(query_img_url, "./", "query"):
        # initialize the image descriptor
        cd = ColorDescriptor((8, 12, 3))
        # load the query image and describe it
        query = cv2.imread("query")
        features = cd.describe(query)
        # perform the search
        searcher = Searcher("a.csv")
        #start = datetime.now()
        results = searcher.search(features)
        #print((datetime.now()-start).seconds)
def main():
    atn_db = DBHandler("../../../database/Memex.db")  # database connection
    topicId = int(sys.argv[1])  # topic id
    must_list = []
    should_list = []
    query_dic = getQuery(atn_db, topicId)
    age_min = 0
    age_max = 0
    height_min = 0
    height_max = 0
    query_body = {"size": 500, "query": {"bool": {"must": [], "should": []}}}
    feature_should_search_map = {
        "name": "name", "hairColor": "hair", "eyeColor": "eye",
        "nationality": "nationality", "ethnicity": "ethnicity",
        "reviewSite": "review", "reviewSiteId": "review", "email": "email",
        "phone": "phone", "state": "", "city": "", "price": "",
        "multiple_providers": "", "socialMedia": "", "socialMediaId": "",
        "services": "", "height": "height", "weight": "weight",
        "post_date": "posted"}

    for key in query_dic:
        if key in ["phone", "age", "height", "hairColor", "eyeColor"]:  # field search
            pass
        else:
            must_list.append(query_dic[key])
    if "age" in query_dic:
        age_min = int(query_dic["age"][:2])
        age_max = int(query_dic["age"][2:])
        should_list.append("age")
    if "height" in query_dic:
        height_min = int(query_dic["height"][:3])
        height_max = int(query_dic["height"][3:])
        should_list.append("height")

    if must_list:  # plain text search fields
        query_body["query"]["bool"]["must"].append(
            {"match": {"raw_content": " ".join(must_list)}})
    else:  # field search
        query_list = []
        if "age" in query_dic:
            query_list.append("age")
        if "height" in query_dic:
            query_list.append("height")
        query_body["query"]["bool"]["must"].append(
            {"match": {"raw_content": " ".join(query_list)}})

    #should_arr = []
    #for word in should_list:
    #    dic = {}
    #    dic["match"] = {}
    #    dic["match"]["raw_content"] = word
    #    should_arr.append(dic)
    #query_body["query"]["bool"]["should"] = should_arr

    if "phone" in query_dic:
        phone_number = re.sub("\D", "", query_dic["phone"])
        query_body["query"]["bool"]["must"].append({"match": {"phone": phone_number}})
    if "age" in query_dic:
        query_body["query"]["bool"]["must"].append(
            {"range": {"age": {"gte": age_min, "lte": age_max}}})
    if "height" in query_dic:
        query_body["query"]["bool"]["must"].append(
            {"range": {"height": {"gte": height_min, "lte": height_max}}})
    if "hairColor" in query_dic:
        query_body["query"]["bool"]["must"].append(
            {"match": {"hairColor": " ".join(query_dic["hairColor"].split(","))}})
    if "eyeColor" in query_dic:
        query_body["query"]["bool"]["must"].append(
            {"match": {"eyeColor": " ".join(query_dic["eyeColor"].split(","))}})

    raw_content_str = query_body["query"]["bool"]["must"][0]
    if not raw_content_str["match"]["raw_content"]:
        # occurs when field search (phone, hairColor, eyeColor) is the only field involved
        query_body["query"]["bool"]["must"].pop(0)

    a = open("test.txt", "w")
    a.write(str(query_body))
    a.close()

    es = Elasticsearch(["localhost:9200/positiongt"], request_timeout=60)
    response = es.search(body=query_body, request_timeout=60)
    documents = response["hits"]["hits"]
    results = []

    if not documents:
        hypoFields = []
        if "hairColor" in query_dic:
            hypoFields.append("hairColor")
        if "eyeColor" in query_dic:
            hypoFields.append("eyeColor")
        is_raw_content = False
        if hypoFields:
            # if there are no results and hairColor or eyeColor is involved, turn the
            # field search (hairColor and eyeColor are field searches by default) into
            # a plain text search
            for term in hypoFields:
                j = -1
                for i in range(len(query_body["query"]["bool"]["must"])):
                    if "raw_content" in query_body["query"]["bool"]["must"][i]["match"]:
                        query_body["query"]["bool"]["must"][i]["match"]["raw_content"] += \
                            " " + " ".join(query_dic[term].split(","))
                        is_raw_content = True
                    if term in query_body["query"]["bool"]["must"][i]["match"]:
                        j = i
                if j >= 0:
                    query_body["query"]["bool"]["must"].pop(j)  # remove the field search
            if not is_raw_content:
                # this case occurs when field searches are the only fields involved
                query_body["query"]["bool"]["must"].insert(
                    0,
                    {"match": {"raw_content": " ".join(
                        map(lambda x: " ".join(query_dic[x].split(",")), hypoFields))}})
            response = es.search(body=query_body, request_timeout=60)
            documents = response["hits"]["hits"]

    if "ethnicity" in query_dic:
        f = open("nation_continent.txt")
        ethnicity_dic = yaml.load(f)
        candidate_countries = ethnicity_dic[query_dic["ethnicity"].lower()] + \
            [query_dic["ethnicity"].capitalize()]
        for document in documents:
            if "ethnicity" in document["_source"] and document["_source"]["ethnicity"]:
                ethnicities = map(lambda x: x.lower(), document["_source"]["ethnicity"])
                #print(ethnicities)
                if query_dic["ethnicity"].capitalize() in ethnicities:
                    print(document["_id"])
                    results.append(document["_id"])
                else:
                    isMatch = False
                    for eth_candi in ethnicities:
                        if isMatch:
                            break
                        for coun_candi in candidate_countries:
                            if fuzz.ratio(eth_candi, coun_candi.lower()) >= 80:
                                print(document["_id"])
                                results.append(document["_id"])
                                isMatch = True
                                break
    else:
        for document in documents:
            print document["_id"]
            results.append(document["_id"])

    atn_db.cur.execute(
        "SELECT round from search_list where topic_id=? ORDER BY round DESC LIMIT 1",
        [topicId])
    res = atn_db.cur.fetchone()
    round = 0
    if res:
        round, = res
    round += 1
    for documentId in results:
        #print((None, topicId, round, documentId))
        atn_db.cur.execute(
            'INSERT INTO %s VALUES(%s)' % ("search_list", "?,?,?,?"),
            (None, topicId, round, documentId))
    atn_db.commit()
    atn_db.close()
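# For reference, a sketch of the query_body that main() would build for a
# hypothetical query_dic such as {"name": "jane", "age": "2530", "phone": "555-0100"};
# the values are made up, only the structure follows the code above.
example_query_body = {
    "size": 500,
    "query": {"bool": {"must": [
        {"match": {"raw_content": "jane"}},          # plain-text fields
        {"match": {"phone": "5550100"}},             # digits only, per re.sub("\D", "", ...)
        {"range": {"age": {"gte": 25, "lte": 30}}},  # "2530" -> min 25, max 30
    ], "should": []}},
}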
def __init__(self, db_conn, url_file):
    """Initialize the crawler with a connection to the database to populate
    and with the file containing the list of seed URLs to begin indexing."""
    self._url_queue = []
    self._doc_id_cache = {}
    self._word_id_cache = {}
    self._url_list = {}
    self._word_list = {}
    self._inverted_index = {}
    self._resolved_inverted_index = {}
    self._link_list = []
    self._db = DBHandler()

    # functions to call when entering and exiting specific tags
    self._enter = defaultdict(lambda *a, **ka: self._visit_ignore)
    self._exit = defaultdict(lambda *a, **ka: self._visit_ignore)

    # add a link to our graph, and indexing info to the related page
    self._enter["a"] = self._visit_a

    # record the currently indexed document's title and increase
    # the font size
    def visit_title(*args, **kargs):
        self._visit_title(*args, **kargs)
        self._increase_font_factor(7)(*args, **kargs)

    # increase the font size when we enter these tags
    self._enter["b"] = self._increase_font_factor(2)
    self._enter["strong"] = self._increase_font_factor(2)
    self._enter["i"] = self._increase_font_factor(1)
    self._enter["em"] = self._increase_font_factor(1)
    self._enter["h1"] = self._increase_font_factor(7)
    self._enter["h2"] = self._increase_font_factor(6)
    self._enter["h3"] = self._increase_font_factor(5)
    self._enter["h4"] = self._increase_font_factor(4)
    self._enter["h5"] = self._increase_font_factor(3)
    self._enter["title"] = visit_title

    # decrease the font size when we exit these tags
    self._exit["b"] = self._increase_font_factor(-2)
    self._exit["strong"] = self._increase_font_factor(-2)
    self._exit["i"] = self._increase_font_factor(-1)
    self._exit["em"] = self._increase_font_factor(-1)
    self._exit["h1"] = self._increase_font_factor(-7)
    self._exit["h2"] = self._increase_font_factor(-6)
    self._exit["h3"] = self._increase_font_factor(-5)
    self._exit["h4"] = self._increase_font_factor(-4)
    self._exit["h5"] = self._increase_font_factor(-3)
    self._exit["title"] = self._increase_font_factor(-7)

    # never go in and parse these tags
    self._ignored_tags = set([
        "meta", "script", "link", "embed", "iframe", "frame",
        "noscript", "object", "svg", "canvas", "applet", "frameset",
        "textarea", "style", "area", "map", "base", "basefont", "param",
    ])

    # set of words to ignore
    self._ignored_words = set([
        "", "the", "of", "at", "on", "in", "is", "it",
        "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
        "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
        "and", "or",
    ])

    # TODO remove me in real version
    self._mock_next_doc_id = 1
    self._mock_next_word_id = 1

    # keep track of some info about the page we are currently parsing
    self._curr_depth = 0
    self._curr_url = ""
    self._curr_doc_id = 0
    self._font_size = 0
    self._curr_words = None

    # get all urls into the queue
    try:
        with open(url_file, "r") as f:
            for line in f:
                self._url_queue.append((self._fix_url(line.strip(), ""), 0))
    except IOError:
        pass
def setUp(self):
    self.db = DBHandler()
class crawler(object):
    """Represents 'Googlebot'. Populates a database by crawling and indexing
    a subset of the Internet.

    This crawler keeps track of font sizes and makes it simpler to manage word
    ids and document ids."""

    def __init__(self, db_conn, url_file):
        """Initialize the crawler with a connection to the database to populate
        and with the file containing the list of seed URLs to begin indexing."""
        self._url_queue = []
        self._doc_id_cache = {}
        self._word_id_cache = {}
        self._url_list = {}
        self._word_list = {}
        self._inverted_index = {}
        self._resolved_inverted_index = {}
        self._link_list = []
        self._db = DBHandler()

        # functions to call when entering and exiting specific tags
        self._enter = defaultdict(lambda *a, **ka: self._visit_ignore)
        self._exit = defaultdict(lambda *a, **ka: self._visit_ignore)

        # add a link to our graph, and indexing info to the related page
        self._enter["a"] = self._visit_a

        # record the currently indexed document's title and increase
        # the font size
        def visit_title(*args, **kargs):
            self._visit_title(*args, **kargs)
            self._increase_font_factor(7)(*args, **kargs)

        # increase the font size when we enter these tags
        self._enter["b"] = self._increase_font_factor(2)
        self._enter["strong"] = self._increase_font_factor(2)
        self._enter["i"] = self._increase_font_factor(1)
        self._enter["em"] = self._increase_font_factor(1)
        self._enter["h1"] = self._increase_font_factor(7)
        self._enter["h2"] = self._increase_font_factor(6)
        self._enter["h3"] = self._increase_font_factor(5)
        self._enter["h4"] = self._increase_font_factor(4)
        self._enter["h5"] = self._increase_font_factor(3)
        self._enter["title"] = visit_title

        # decrease the font size when we exit these tags
        self._exit["b"] = self._increase_font_factor(-2)
        self._exit["strong"] = self._increase_font_factor(-2)
        self._exit["i"] = self._increase_font_factor(-1)
        self._exit["em"] = self._increase_font_factor(-1)
        self._exit["h1"] = self._increase_font_factor(-7)
        self._exit["h2"] = self._increase_font_factor(-6)
        self._exit["h3"] = self._increase_font_factor(-5)
        self._exit["h4"] = self._increase_font_factor(-4)
        self._exit["h5"] = self._increase_font_factor(-3)
        self._exit["title"] = self._increase_font_factor(-7)

        # never go in and parse these tags
        self._ignored_tags = set([
            "meta", "script", "link", "embed", "iframe", "frame",
            "noscript", "object", "svg", "canvas", "applet", "frameset",
            "textarea", "style", "area", "map", "base", "basefont", "param",
        ])

        # set of words to ignore
        self._ignored_words = set([
            "", "the", "of", "at", "on", "in", "is", "it",
            "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
            "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
            "and", "or",
        ])

        # TODO remove me in real version
        self._mock_next_doc_id = 1
        self._mock_next_word_id = 1

        # keep track of some info about the page we are currently parsing
        self._curr_depth = 0
        self._curr_url = ""
        self._curr_doc_id = 0
        self._font_size = 0
        self._curr_words = None

        # get all urls into the queue
        try:
            with open(url_file, "r") as f:
                for line in f:
                    self._url_queue.append((self._fix_url(line.strip(), ""), 0))
        except IOError:
            pass

    # TODO remove me in real version
    def _mock_insert_document(self, uurl):
        """A function that pretends to insert a url into a document db table
        and then returns that newly inserted document's id."""
        ret_id = self._mock_next_doc_id
        url = uurl.encode("ascii")
        self._url_list[ret_id] = url
        self._mock_next_doc_id += 1
        return ret_id

    # TODO remove me in real version
    def _mock_insert_word(self, uword):
        """A function that pretends to insert a word into the lexicon db table
        and then returns that newly inserted word's id."""
        ret_id = self._mock_next_word_id
        word = uword.encode("ascii")
        self._word_list[ret_id] = word
        self._mock_next_word_id += 1
        return ret_id

    def _insert_pagerank(self):
        """Insert the generated score for each page or link into the PageRank
        database table."""
        if len(self._link_list) > 0:
            _ranked_list = pagerank.page_rank(self._link_list)
            self._db.put_pageranks(_ranked_list)

    def word_id(self, word):
        """Get the word id of some specific word."""
        if word in self._word_id_cache:
            return self._word_id_cache[word]

        # TODO: 1) add the word to the lexicon, if that fails, then the
        #          word is in the lexicon
        #       2) query the lexicon for the id assigned to this word,
        #          store it in the word id cache, and return the id.
        word_id = self._mock_insert_word(word)
        self._word_id_cache[word] = word_id
        return word_id

    def document_id(self, url):
        """Get the document id for some url."""
        if url in self._doc_id_cache:
            return self._doc_id_cache[url]

        # TODO: just like the word id cache, but for documents. If the document
        #       doesn't exist in the db then only insert the url and leave
        #       the rest to their defaults.
        doc_id = self._mock_insert_document(url)
        self._doc_id_cache[url] = doc_id
        return doc_id

    def _fix_url(self, curr_url, rel):
        """Given a url and either something relative to that url or another
        url, get a properly parsed url."""
        rel_l = rel.lower()
        if rel_l.startswith("http://") or rel_l.startswith("https://"):
            curr_url, rel = rel, ""

        # compute the new url based on import
        curr_url = urlparse.urldefrag(curr_url)[0]
        parsed_url = urlparse.urlparse(curr_url)
        return urlparse.urljoin(parsed_url.geturl(), rel)

    def add_link(self, from_doc_id, to_doc_id):
        """Add a link into the database, or increase the number of links
        between two pages in the database."""
        self._link_list.append((from_doc_id, to_doc_id))

    def _visit_title(self, elem):
        """Called when visiting the <title> tag."""
        title_text = self._text_of(elem).strip()
        print "document title=" + repr(title_text)

        # TODO update document title for document id self._curr_doc_id

    def _visit_a(self, elem):
        """Called when visiting <a> tags."""
        dest_url = self._fix_url(self._curr_url, attr(elem, "href"))

        #print "href="+repr(dest_url), \
        #      "title="+repr(attr(elem,"title")), \
        #      "alt="+repr(attr(elem,"alt")), \
        #      "text="+repr(self._text_of(elem))

        # add the just found URL to the url queue
        self._url_queue.append((dest_url, self._curr_depth))

        # add a link entry into the database from the current document to the
        # other document
        self.add_link(self._curr_doc_id, self.document_id(dest_url))

        # TODO add title/alt/text to index for destination url

    def _add_words_to_document(self):
        # TODO: knowing self._curr_doc_id and the list of all words and their
        #       font sizes (in self._curr_words), add all the words into the
        #       database for this document
        print "    num words=" + str(len(self._curr_words))

    def _increase_font_factor(self, factor):
        """Increase/decrease the current font size."""
        def increase_it(elem):
            self._font_size += factor
        return increase_it

    def _visit_ignore(self, elem):
        """Ignore visiting this type of tag."""
        pass

    def _add_text(self, elem):
        """Add some text to the document. This records word ids and word font
        sizes into the self._curr_words list for later processing."""
        words = WORD_SEPARATORS.split(elem.string.lower())
        for word in words:
            word = word.strip()
            if word in self._ignored_words:
                continue
            self._curr_words.append((self.word_id(word), self._font_size))

    def _text_of(self, elem):
        """Get the text inside some element without any tags."""
        if isinstance(elem, Tag):
            text = []
            for sub_elem in elem:
                text.append(self._text_of(sub_elem))
            return " ".join(text)
        else:
            return elem.string

    def _index_document(self, soup):
        """Traverse the document in depth-first order and call functions when
        entering and leaving tags. When we come across some text, add it into
        the index. This handles ignoring tags that we have no business looking
        at."""

        class DummyTag(object):
            next = False
            name = ""

        class NextTag(object):
            def __init__(self, obj):
                self.next = obj

        tag = soup.html
        stack = [DummyTag(), soup.html]

        while tag and tag.next:
            tag = tag.next

            # html tag
            if isinstance(tag, Tag):
                if tag.parent != stack[-1]:
                    self._exit[stack[-1].name.lower()](stack[-1])
                    stack.pop()

                tag_name = tag.name.lower()

                # ignore this tag and everything in it
                if tag_name in self._ignored_tags:
                    if tag.nextSibling:
                        tag = NextTag(tag.nextSibling)
                    else:
                        self._exit[stack[-1].name.lower()](stack[-1])
                        stack.pop()
                        tag = NextTag(tag.parent.nextSibling)
                    continue

                # enter the tag
                self._enter[tag_name](tag)
                stack.append(tag)

            # text (text, cdata, comments, etc.)
            else:
                self._add_text(tag)

    def crawl(self, depth=2, timeout=3):
        """Crawl the web!"""
        seen = set()

        while len(self._url_queue):
            url, depth_ = self._url_queue.pop()

            # skip this url; it's too deep
            if depth_ > depth:
                continue

            doc_id = self.document_id(url)

            # we've already seen this document
            if doc_id in seen:
                continue

            seen.add(doc_id)  # mark this document as having been visited

            socket = None
            try:
                socket = urllib2.urlopen(url, timeout=timeout)
                soup = BeautifulSoup(socket.read())

                self._curr_depth = depth_ + 1
                self._curr_url = url
                self._curr_doc_id = doc_id
                self._font_size = 0
                self._curr_words = []
                self._index_document(soup)
                self._add_words_to_document()
                self._get_inverted_index_helper()
                print "    url=" + repr(self._curr_url)

            except Exception as e:
                print e
                pass
            finally:
                if socket:
                    socket.close()

        self._db.put_lexicon(self._word_list)
        self._db.put_inverted_index(self._inverted_index)
        self._insert_pagerank()
        self._db.put_doc_index(self._url_list)
        # self._db.close()

    def _get_inverted_index_helper(self):
        """Helper function to store the inverted index in a dict(). The word id
        is used as the key, and the set of document ids as the value."""
        for key in self._curr_words:
            word_id = next(iter(key))
            if word_id not in self._inverted_index:
                self._inverted_index[word_id] = set([self._curr_doc_id])
            else:
                self._inverted_index[word_id].add(self._curr_doc_id)

    def get_inverted_index(self):
        return self._inverted_index

    def get_resolved_inverted_index(self):
        """Word ids are replaced by the word strings, and the document ids are
        replaced by URL strings in the inverted index. Return data in a dict()."""
        for word_id in self._inverted_index:
            urls = set([])
            for url_id in self._inverted_index[word_id]:
                urls.add(self._url_list[url_id])
            word = self._word_list[word_id]
            self._resolved_inverted_index[word] = urls
        return self._resolved_inverted_index
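# A minimal usage sketch for the crawler above, assuming a local "urls.txt" seed
# file (hypothetical). crawl() persists the lexicon, inverted index, pageranks
# and document index through DBHandler when it finishes.
if __name__ == "__main__":
    bot = crawler(None, "urls.txt")   # db_conn is unused by this version
    bot.crawl(depth=1)
    print bot.get_resolved_inverted_index()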
import json

from flask import Flask
#from flask.ext.restful import Api, Resource, reqparse
from flask_restful import Api, Resource, reqparse
from flask_restful.utils import cors
#from flask.ext.restful.utils import cors
#from flask.ext.cors import CORS
from flask_cors import CORS

from modules.ssh import QoSHandler

app = Flask(__name__)
CORS(app)
api = Api(app)

config = json.load(open('./config.json', 'r'))
db_handler = DBHandler(config)


class UserAPI(Resource):
    def __init__(self):
        self.reqparse = reqparse.RequestParser()
        super(UserAPI, self).__init__()

    @cors.crossdomain(origin='*')
    def get(self, uid):
        users = []
        if uid != 'all':
            users = uid.split(',')
        resultset = db_handler.get_users(users)
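# A sketch of how UserAPI might be wired up and served, following the usual
# Flask-RESTful pattern; the '/users/<string:uid>' route and the host/port are
# illustrative, not taken from the snippet above.
api.add_resource(UserAPI, '/users/<string:uid>')

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000, debug=True)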