Example #1
def manual_queries_topic_graphs(from_dataset, to_dataset) :

	db = MyMySQL(db=to_dataset)
	pub_ids = set(db.select("id", table="papers"))

	from_folder = config.DATA + "query_sets/" + from_dataset + "/manual/"
	to_folder = config.DATA + "query_sets/" + to_dataset + "/manual/"

	for file_name in os.listdir(from_folder) :

		print file_name
		from_file = open(from_folder + file_name, 'r')
		to_file = open(to_folder + file_name, 'w')

		# Read the header line and write it back unchanged
		header = from_file.readline().strip('\n')
		print >> to_file, header

		for line in from_file :
			relev, pub_id, title = line.strip().split('\t')

			# Blank out ids that do not exist in the target dataset
			if pub_id not in pub_ids :
				pub_id = ''

			print >> to_file, "%s\t%s\t%s" % (relev, pub_id, title)

		from_file.close()
		to_file.close()
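A hypothetical invocation, assuming 'csx' and 'aminer' are dataset names under config.DATA/query_sets/ with the manual query files already in place:

# Dataset names are placeholders; any source/target pair works
manual_queries_topic_graphs('csx', 'aminer')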
Example #2
  def __init__(self, n=None):
    '''
    Loads (id, title, abstract) rows for every paper, optionally
    keeping only a random sample of n of them.
    '''
    db = MyMySQL(db=config.DB_NAME,
                 user=config.DB_USER,
                 passwd=config.DB_PASSWD)

    rows = db.select(fields=["id", "title", "abstract"], table="papers")
    if n:
      rows = random.sample(rows, n)

    # Map string ids to (title, abstract) tuples
    self.pubs = {str(id): (title, abstract) for id, title, abstract in rows}
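The enclosing class is not shown in this snippet. Assuming a hypothetical name such as PubTexts, a minimal use that samples 1,000 papers:

# PubTexts is a placeholder name for the enclosing class
texts = PubTexts(n=1000)
print len(texts.pubs)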
Example #3
def check_ids(folder) :

	db = MyMySQL(db='csx')

	for i in xrange(1,8) :
		print i
		print

		with open(folder + str(i) + ".txt") as query_file :

			_header_ = query_file.readline()  # skip the header line
			for line in query_file :

				relev, pub_id, title = line.strip().split('\t')
				if len(db.select("id", table="papers", where="id='%s'" % pub_id)) == 0 :
					print "Pub not found:", pub_id
Example #4
def time_diversity(names, query_set) :

	# Get year of each paper for assembling personalization array next
	db = MyMySQL(db=config.DATASET)
	rows = db.select(["id", "year"], table="papers", where="year is not NULL and year between 1950 and 2013")
	years = {pub_id: year for pub_id, year in rows}

	for name in names :

		file_path = "%s/results/%s/%s/%s.p" % (config.DATA, config.DATASET, query_set, name)

		returned_years = []
		results = cPickle.load(open(file_path, 'rb'))
		for _correct, _relevances, returned in results :
			for r in returned :
				if r in years :
					returned_years.append(years[r])

		print "%s\t%.2f\t%.2f" % (name, np.mean(returned_years), np.std(returned_years))
Example #5
def fix_contexts_limits() :
  """
  Updates the contexts on the graph table so that the tokens on the
  extremities are removed. These are usually parts of words, and therefore
  are meaningless.
  """
  db = MyMySQL(db="csx", user="******", passwd="")
  ctxs = db.select(["citing", "cited", "context"], table="graph", where="context != ''")

  print len(ctxs)
  for citing, cited, ctx in progress(ctxs):
    s = ctx.find(" ")
    e = ctx.rfind(" ")

    # Skip contexts with fewer than two spaces: there is no complete
    # leading and trailing token to strip
    if (s == -1) or (s == e):
      continue

    db.update(table="graph",
              set="context='%s'" % ctx[s+1:e],
              where="(citing='%s') AND (cited='%s')" % (citing, cited))
Example #6
def keyword_centric(keyword, from_db, to_db):

    db = MyMySQL(db=from_db)
    pub_ids = db.select("paper_id",
                        table="keywords",
                        where="kw='%s'" % keyword)

    nodes = set()
    new_nodes = set()
    new_nodes.update(pub_ids)

    # Expand the seed set one citation hop at a time until the target
    # size is reached
    n = 50000
    while len(nodes) < n:

        new_nodes = set(get_next_hop(new_nodes)) - nodes
        if not new_nodes:
            # No unseen neighbors left; stop to avoid looping forever
            break

        nodes.update(new_nodes)
        print len(nodes)

    print "Adding %d nodes." % len(nodes)

    new_db = MyMySQL(db=to_db)
    new_db.insert(into="use_papers", fields=["paper_id"], values=list(nodes))
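get_next_hop is not defined in this snippet. A plausible sketch, assuming the graph table stores (citing, cited) pairs as in Example #5 and that neighbors are wanted in both directions (the helper and its table layout are assumptions, not the original implementation):

def get_next_hop(pub_ids):
    db = MyMySQL(db='csx')

    neighbors = set()
    for pub_id in pub_ids:
        # Papers this one cites...
        neighbors.update(db.select("cited", table="graph",
                                   where="citing='%s'" % pub_id))
        # ...and papers citing this one
        neighbors.update(db.select("citing", table="graph",
                                   where="cited='%s'" % pub_id))

    return neighbors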
Example #7
    def search(self, query, exclude=[], limit=20, force=False):

        file_path = config.CITERANK_FILE_PATH
        if not os.path.exists(file_path):
            g = nx.DiGraph()
            g.add_edges_from(model.get_all_edges())

            # Remove documents from the exclude list
            g.remove_nodes_from(exclude)

            # Get year of each paper for assembling personalization array next
            db = MyMySQL(db=config.DATASET)
            rows = db.select(["id", "year"], table="papers")
            years = {}
            for pub_id, year in rows:
                if year is not None:
                    years[pub_id] = year

            # Calculate the median to use in the missing values
            year_median = np.median(years.values())

            # Create a personalization array by exponentially decaying
            # each paper's factor by its age
            pers = {}
            for node in g.nodes():
                # Replace missing or implausible years with the median
                if (node not in years) or (years[node] < 1960) or (years[node] > 2013):
                    years[node] = year_median

                pers[node] = np.exp(float(years[node] - 2013) / self.tau)

            print "Running PageRank with %d nodes and age-defined personalization vector." % g.number_of_nodes()
            r = nx.pagerank(g, personalization=pers)

            print "Writing results"
            cPickle.dump(r, open(file_path, "w"))

        # Load cached PageRank values for every node
        r = cPickle.load(open(file_path, "rb"))

        # Sort documents by decreasing PageRank value
        ids, _score_ = zip(
            *sorted(r.items(), key=lambda (k, v): v, reverse=True))

        # Fetch all documents that match at least one of the query terms.
        # Store them in a set for fast lookup.
        pub_ids = self.index.search(query,
                                    search_fields=["title", "abstract"],
                                    return_fields=["id"],
                                    ignore=exclude)

        pub_ids = set([pid for (pid, ) in pub_ids])

        results = []
        for id in ids:
            if id in pub_ids:
                results.append(id)
                if len(results) == limit:
                    break

        return results
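For intuition on the personalization vector: each paper's weight decays exponentially with its age relative to 2013, at a rate set by tau. A quick illustration with a hypothetical tau of 2.0 (the class above uses self.tau):

import numpy as np

tau = 2.0  # placeholder value; the real one comes from self.tau
for year in [2013, 2011, 2008, 2003]:
    print year, np.exp(float(year - 2013) / tau)

# approximate output:
# 2013 1.0
# 2011 0.368
# 2008 0.082
# 2003 0.007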
Example #8
class Downloader():
    def __init__(self):
        '''
        Stores the process id and creates a task manager to get
        and update tasks.
        '''
        # Zeno task manager
        self.tasks = zeno.TasksManager("tasks",
                                       host=config.DB_HOST,
                                       user=config.DB_USER,
                                       passwd=config.DB_PASSWD)

        # Database connection
        self.db = MyMySQL(db=config.DB_NAME,
                          host=config.DB_HOST,
                          user=config.DB_USER,
                          passwd=config.DB_PASSWD)

        # Logging configuration
        self.log = utils.config_logging(
            'downloader',
            stream=sys.stdout,
            level=logging.DEBUG,
            format='%(asctime)s (%(name)s) [%(levelname)6s]: %(message)s',
            datefmt="%Y-%m-%d %H:%M:%S")

    def parse_error(self, content):
        '''
        Parses the returned response's HTML and throws the appropriate exception.
        '''
        # str.find() returns -1 when the text is absent, so test explicitly
        if content.find("Download Limit Exceeded") >= 0:
            raise LimitReachedException()
        else:
            raise Exception()

    def make_csx_url(self, id):
        return "http://citeseerx.ist.psu.edu/viewdoc/download?doi=%s&rep=rep1&type=pdf" % id

    def download_from_csx(self, paper_id):
        '''
        Downloads the paper's PDF from CiteSeerX to the local disk.
        '''

        # Build the CiteSeerX download URL for this paper
        url = self.make_csx_url(paper_id)

        headers = {'User-Agent': 'Chrome/34.0.1847.116 (X11; Linux x86_64)'}
        response = requests.get(url, headers=headers)

        if (response.status_code != 200):
            raise RequestException("%d: %s" %
                                   (response.status_code, response.reason))

        if response.headers['Content-Type'].startswith('text/html'):
            self.parse_error(response.content)

        # Save the file to the local disk
        # (self.data_folder is expected to be configured elsewhere)
        file_path = os.path.join(self.data_folder, "%s.pdf" % paper_id)
        pdf_file = open(file_path, "wb")
        pdf_file.write(response.content)
        pdf_file.close()

    def get_all_urls(self, paper_id):
        ''' Returns all known URLs for the paper, including those of
        alternative versions in the same CiteSeerX cluster. '''

        cluster_id = self.db.select_one("cluster",
                                        table="papers",
                                        where="id='%s'" % paper_id)

        alt_paper_ids = self.db.select("id",
                                       table="papers",
                                       where="cluster=%d" % cluster_id)

        urls = []
        for altern_id in alt_paper_ids:
            urls = urls + [self.make_csx_url(altern_id)]

            other_urls = self.db.select("url",
                                        table="urls",
                                        where="paperid='%s'" % altern_id)
            urls = other_urls + urls

        return urls

    def download(self, paper_id):
        '''
        Tries every known URL for the paper until a PDF is downloaded
        successfully. Returns True on success, False otherwise.
        '''
        headers = {'User-Agent': 'Chrome/34.0.1847.116 (X11; Linux x86_64)'}

        # Get all candidate URLs for this paper
        urls = self.get_all_urls(paper_id)
        for url in urls:

            # Only supports PDF for now
            if url[-3:].lower() != "pdf":
                continue

            try:
                response = requests.get(url, headers=headers)
            except ConnectionError:
                self.log.warn("Connection error! Ignoring URL '%s'" % (url))
                continue

            response_type = response.headers['Content-Type']

            if response_type.startswith('text/html'):
                if response.content.find("Download Limit Exceeded") >= 0:
                    raise LimitReachedException()
                else:
                    continue

            if (response.status_code != 200) or (response_type !=
                                                 "application/pdf"):
                continue

            # Save the file to the local disk
            file_path = config.PDF_PATH % paper_id
            pdf_file = open(file_path, "wb")
            pdf_file.write(response.content)
            pdf_file.close()

            # Download successfully completed
            return True

        # If we got here, no valid URL was found
        return False

    def run(self):

        self.log.info("Starting %s." % os.getpid())

        # Keep running until a stop file is found
        while (not os.path.exists("stop")):

            try:
                paper_id = self.tasks.get_next("START")

                if not self.download(paper_id):
                    raise DownloadException("Could not download paper '%s'." %
                                            paper_id)

                # Update the task status and the disk in which the file was saved.
                self.tasks.update_success(paper_id, "DOWNLOADED")

                # Everything went OK if got here
                self.log.info("%s: OK" % paper_id)

            # Nothing to collect
            except NothingToProcessException:
                self.log.error("Nothing to process.")
                break

            except LimitReachedException:
                self.log.error("Request limit reached!! Waiting...")
                self.tasks.update_release(
                    paper_id, "Request limit reached. Will try again later.")
                time.sleep(60 * 60)

            # URL missing in the DB or not returning the resource.
            except DownloadException, e:
                self.log.error("%s: FAIL" % (paper_id))
                self.tasks.update_error(paper_id, message=str(e))

            # Any other exception: log the traceback, update the DB, and move on
            except Exception, e:
                self.log.error("%s: FAIL: %s" %
                               (paper_id, traceback.format_exc()))
                self.tasks.update_error(paper_id, message=str(e))
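
A minimal entry point sketch (hypothetical; the original module's launcher is not shown). It runs a single downloader process until a file named "stop" appears in the working directory:

if __name__ == '__main__':
    downloader = Downloader()
    downloader.run()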