Example #1
def load_contexts():

  db_csx = MyMySQL(db="csx", user="******", passwd="")
  db_cg = MyMySQL(db="csx_citegraph", user="******", passwd="")

  pubs = get_pubs(db_csx)
  print "%d publications loaded." % len(pubs)

  clusters = {str(pub_id): cluster for pub_id, cluster in pubs}

  citations = get_citations(db_csx)
  print "%d citations loaded." % len(citations)

  found = 0
  for n, (citing, cited) in enumerate(citations):
    cciting = clusters[str(citing)]
    ccited  = clusters[str(cited)]
    context = get_context(db_cg, cciting, ccited)

    if context is None:
      context = ''
    else:
      context = context.replace("'", '"')
      found += 1

    # if (context is not None) and (context != ""):
    try:
      update_graph(db_csx, citing, cited, context)
    except Exception:
      print "Exception when updating 'graph' table."


    print "%d out of %d contexts found." % (found, n + 1)
Example #2
def write_surveys_queries_file(prefix, npubs=110) :

	db = MyMySQL(db=config.DB_NAME)
	candidates = db.select_query('''SELECT id, substring(title,1,140), year
																	FROM papers
																	WHERE title LIKE '%survey%' AND (year IS NOT NULL)
																	AND (year BETWEEN 1950 AND 2014)''')

	print "Candidates: %s" % len(candidates)

	# Add the word 'survey' to the stop words for this particular case
	_stop_words_.add("survey")

	# Write candidates to file
	file = open(prefix + ".txt", "w")

	n = 0
	for pub_id, title, year in candidates :

		citations = utils.get_cited(db, pub_id)
		if len(citations)>=20 :
			query = to_query(title)

			print >> file, "%s\t%d\t%s\t%s" % (pub_id, year, title.strip(), query)

			n += 1
			if (n >= npubs) :
				break

	file.close()
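to_query is not shown in this snippet. A plausible sketch, assuming re is imported and _stop_words_ is the module-level stop word set referenced above:

def to_query(title) :
	# Hypothetical: keep only lowercase alphabetic tokens that are not stop words.
	words = re.findall(r"[a-z]+", title.lower())
	return ' '.join(w for w in words if w not in _stop_words_)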
Example #3
def get_stats(dataset) :
	
	db = MyMySQL(db=dataset)

	kw_table = 'doc_ngrams' if (dataset=='aminer') else 'doc_kws'
	
	npubs = db.select_query("select count(*) from papers")[0][0]
	nauthors = db.select_query("select count(distinct author_id) from authorships")[0][0]
	nkws = db.select_query("select count(distinct ngram) from %s" % kw_table)[0][0]
	nvenues = db.select_query("select count(distinct venue_id) from papers")[0][0]

	pubs_pubs    = db.select_query("select count(*) from graph")[0][0]
	auths_auths  = db.select_query("select count(*) from coauthorships")[0][0]
	pubs_authors = db.select_query("select count(*) from authorships")[0][0]
	pubs_kws     = db.select_query("select count(*) from %s where value>=%f" % (kw_table, config.MIN_NGRAM_TFIDF))[0][0]
	
#	npubs    = 1
#	nauthors = 2
#	nkws     = 3
#	nvenues  = 4
#	pubs_pubs    = 1
#	auths_auths  = 4
#	pubs_authors = 2
#	pubs_kws     = 3

	
	print "\\hline"	
	print "\\multicolumn{4}{|c|}{%s} \\\\" % TEX_NAMES[dataset]
	print "\\hline"
	print "pubs ($N_p$) & %d & pubs-pubs & %d \\\\" % (npubs, pubs_pubs)
	print "authors   & %d & authors-authors & %d  \\\\" % (nauthors, auths_auths)
	print "keywords ($N_k$)  & %d  & pubs-keywords   & %d \\\\" % (nkws, pubs_kws)
	print "venues ($N_v$)    & %d     & pubs-authors  & %d \\\\" % (nvenues, pubs_authors)
Example #4
def manual_queries_topic_graphs(from_dataset, to_dataset) :

	db = MyMySQL(db=to_dataset)
	pub_ids = set(db.select("id", table="papers"))

	from_folder = config.DATA + "query_sets/" + from_dataset + "/manual/"
	to_folder = config.DATA + "query_sets/" + to_dataset + "/manual/"

	for file_name in os.listdir(from_folder) :

		print file_name
		from_file = open(from_folder + file_name, 'r')
		to_file = open(to_folder + file_name, 'w')

		# Read and write back header line
		header = from_file.readline().strip('\n')
		print >> to_file, header

		for line in from_file :
			relev, pub_id, title = line.strip().split('\t')
			if (pub_id not in pub_ids) :
				pub_id = ''

			print >> to_file, "%s\t%s\t%s" %(relev, pub_id, title)

		from_file.close()
		to_file.close()
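A hypothetical call, porting the manually judged query files from one dataset's folder to another's:

# Keeps only the publication ids that also exist in the target dataset.
manual_queries_topic_graphs('csx', 'aminer')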
Example #5
    def __init__(self):

        # Zeno task manager
        self.tasks = zeno.TasksManager("tasks",
                                       host=config.DB_HOST,
                                       user=config.DB_USER,
                                       passwd=config.DB_PASSWD)

        # Database connection
        self.db = MyMySQL(db=config.DB_NAME,
                          host=config.DB_HOST,
                          user=config.DB_USER,
                          passwd=config.DB_PASSWD)

        # Logging configuration
        self.log = utils.config_logging(
            'tokenizer',
            stream=sys.stdout,
            level=logging.DEBUG,
            format='%(asctime)s (%(name)s) [%(levelname)6s]: %(message)s',
            datefmt="%Y-%m-%d %H:%M:%S")

        self.MIN_TOKENS = 10

        # Create the folders if they don't exist yet
        utils.ensure_folder(os.path.dirname(config.TOKENS_PATH))
        utils.ensure_folder(os.path.dirname(config.TOKENS_PATH_PARTS))
Example #6
def get_cited_papers(doc_id) :

	db = MyMySQL(db=DB_NAME, user=DB_USER, passwd=DB_PASSWD)

	return db.select_query("""SELECT r.cited_paper_id, g.start, g.end 
														FROM citations c 
														JOIN citation_groups g ON c.group_id = g.id 
														JOIN refs r ON c.ref_id=r.id 
														WHERE c.paper_id='%s' AND r.cited_paper_id IS NOT NULL""" % doc_id)
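Example usage of the function above; the document id is just a placeholder:

for cited_id, start, end in get_cited_papers('some_doc_id') :
	print "%s cited at span [%d, %d]" % (cited_id, start, end)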
Example #7
  def __init__(self, n=None):
    db = MyMySQL(db=config.DB_NAME,
                 user=config.DB_USER,
                 passwd=config.DB_PASSWD)

    rows = db.select(fields=["id", "title", "abstract"], table="papers")
    if n :
      rows = random.sample(rows, n)

    self.pubs = {str(id): (title, abs) for id, title, abs in rows}
Example #8
def main(argv):
    query = None
    usr = None
    output_file = None
    pwd = None
    n = 20

    try:
        opts, _args_ = getopt.getopt(argv, "hq:o:n:u:p:")
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            sys.exit()

        elif opt == "-q":
            query = arg

        elif opt == "-o":
            output_file = arg

        elif opt == "-n":
            n = int(arg)

        elif opt == "-u":
            usr = arg

        elif opt == "-p":
            pwd = arg

        else:
            print "Invalid option: %s" % opt

    # Check mandatory arguments
    if (not query or not usr or not pwd):
        usage()
        sys.exit(2)

    s = searchers.Searcher(**config.PARAMS)
    pub_ids = s.search(query, limit=n)

    if not output_file:
        output_file = utils.get_graph_file_name(query)

    # Writes the graph structure as a gexf file
    nx.write_gexf(s.graph, output_file)

    # Prints the results
    db = MyMySQL(db='csx', user=usr, passwd=pwd)
    for id in pub_ids:
        print "%12s\t %s" % (
            id, db.select_one("title", table="papers", where="id='%s'" % id))
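The usage() helper is not included in the snippet. A minimal sketch matching the options parsed above (the script name is an assumption):

def usage():
    # Hypothetical help message for the command line options handled in main().
    print "usage: search.py -q <query> -u <db_user> -p <db_passwd> [-o <output.gexf>] [-n <top_n>]"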
Example #9
    def __init__(self):
        self.index = Index(config.INDEX_PATH)

        # Get citation counts and store into dict for fast lookup
        db = MyMySQL(db=config.DB_NAME,
                     user=config.DB_USER,
                     passwd=config.DB_PASSWD)

        ncitations = db.select_query(
            "SELECT cited, COUNT(*) from graph GROUP BY cited")
        self.ncitations = dict(ncitations)
Example #10
    def __init__(self, ec2_manager, ec2_instance_id, ec2_instance_dns):
        '''
		Since this is run on the main process, it shouldn't
		open connections or file descriptors.
		'''
        # Zeno task manager
        self.tasks = zeno.TasksManager("tasks",
                                       host=config.DB_HOST,
                                       user=config.DB_USER,
                                       passwd=config.DB_PASSWD)

        # Database connection
        self.db = MyMySQL(db=config.DB_NAME,
                          host=config.DB_HOST,
                          user=config.DB_USER,
                          passwd=config.DB_PASSWD)

        # EC2 manager to issue commands.
        self.ec2_manager = ec2_manager

        # EC2 instance information to be used as a proxy.
        self.ec2_instance_id = ec2_instance_id
        self.ec2_instance_dns = ec2_instance_dns

        # Logging configuration
        self.log = utils.config_logging(
            'downloader',
            stream=sys.stdout,
            level=logging.DEBUG,
            format='%(asctime)s (%(name)s) [%(levelname)6s]: %(message)s',
            datefmt="%Y-%m-%d %H:%M:%S")
Example #11
def check_ids(folder) :

	db = MyMySQL(db='csx')

	for i in xrange(1,8) :
		print i
		print

		with open(folder + str(i) + ".txt") as file :

			_header_ = file.readline()
			for line in file :

				relev, pub_id, title = line.strip().split('\t')
				if (len(db.select("id", table="papers", where="id='%s'"%pub_id)) == 0) :
					print "Pub not found:", pub_id
Example #12
def find_ids_unsupervised(titles, index_folder):

    db = MyMySQL(db='csx')
    index = Index(index_folder)

    found = 0
    doc_ids = []
    for title in titles:
        top_docs, scores = index.search(title,
                                        search_fields=["title"],
                                        return_fields=["id"],
                                        return_scores=True,
                                        limit=5)
        #		ids = index.get_documents(top_docs, fields="id")

        # To decide if the most similar title in the index is a hit, we check if its score
        # is significantly higher than those of the hits that follow it (second to fifth).

        if len(scores) > 2 and (scores[0] > 2 * np.mean(scores[1:])):
            doc_ids.append(top_docs[0][0])
            found += 1
        else:
            doc_ids.append("")

        # Only enable for debugging and finding a threshold
        if 0:
            print "-------"
            print "%s" % (title)
            print "-------"
            for i, (id, ) in enumerate(top_docs):
                title = db.select_one("title",
                                      table="papers",
                                      where="id='%s'" % id)
                print "%.2f\t%s" % (scores[i], title.encode("UTF-8"))

            if (scores[0] > 2 * np.mean(scores[1:])):
                print "Found!",
                op = '>'
            else:
                print "Not found!",
                op = '<'

            print "(%.2f %s %.2f)\n" % (scores[0], op, 2 * np.mean(scores[1:]))

    return doc_ids
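A usage sketch; the titles are made-up inputs and config.INDEX_PATH is borrowed from the other examples:

titles = ["A survey of text classification methods",
          "Latent semantic indexing for digital libraries"]
ids = find_ids_unsupervised(titles, config.INDEX_PATH)
for title, pub_id in zip(titles, ids):
    print "%s -> %s" % (title, pub_id if pub_id else "(not found)")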
Example #13
def get_citing_papers(doc_id) :
	
	db = MyMySQL(db=DB_NAME, user=DB_USER, passwd=DB_PASSWD)
	
	query = """SELECT r.paper_id, 
										cg.start, cg.end 
										FROM refs r 
										JOIN citations c ON r.id=c.ref_id 
										JOIN citation_groups cg ON c.group_id=cg.id 
										WHERE cited_paper_id='%s' """ % doc_id
	rows = db.select_query(query)

	# Group citations by paper
	citations = defaultdict(list)
	for citing_paper, start, end in rows :
		citations[citing_paper].append((start, end))

	return citations
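Example usage of the function above (the paper id is a placeholder):

citing = get_citing_papers('some_doc_id')
for citing_id, spans in citing.items() :
	print "%s cites it in %d passage(s)" % (citing_id, len(spans))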
Example #14
def time_diversity(names, query_set) :


	# Get year of each paper for assembling personalization array next
	db = MyMySQL(db=config.DATASET)
	rows = db.select(["id", "year"], table="papers", where="year is not NULL and year between 1950 and 2013")
	years = {pub_id: year for pub_id, year in rows}

	for name in names :

		file_path = "%s/results/%s/%s/%s.p" % (config.DATA, config.DATASET, query_set, name)

		returned_years = []
		results = cPickle.load(open(file_path, 'r'))
		for _correct, _relevances, returned in results :
			for r in returned :
				if r in years :
					returned_years.append(years[r])

		print "%s\t%.2f\t%.2f" % (name, np.mean(returned_years), np.std(returned_years))
Example #15
def write_surveys_queries(n=110) :

	db = MyMySQL(db=config.DB_NAME)

	if not os.path.exists(config.QUERY_SETS_PATH) :
		os.mkdir(config.QUERY_SETS_PATH)

	prefix = config.QUERY_SETS_PATH + "surveys"

#	write_surveys_queries_file(prefix, n)
	write_query_set_folder(db, prefix)
Example #16
def fix_contexts_limits() :
  """
  Updates the contexts on the graph table so that the tokens on the
  extremities are removed. These are usually parts of words, and therefore
  are meaningless.
  """
  db = MyMySQL(db="csx", user="******", passwd="")
  ctxs = db.select(["citing", "cited", "context"], table="graph", where="context != ''")

  print len(ctxs)
  for citing, cited, ctx in progress(ctxs):
    s = ctx.find(" ")
    e = ctx.rfind(" ")

    # print ctx
    # print ctx[s+1:e]
    # print

    db.update(table="graph",
              set="context='%s'" % ctx[s+1:e],
              where="(citing='%s') AND (cited='%s')" % (citing, cited))
Example #17
def get_texts(pub_ids, use_title=True, use_abs=True) :
  '''
  This is a non-batch version. Much slower but more
  memory efficient.
  '''
  db = MyMySQL(db='csx', user='******', passwd='')

  fields = []
  if use_title: fields.append("title")
  if use_abs: fields.append("abstract")

  texts = []
  for pub_id in pub_ids:
    text_fields = db.select_one(fields=fields, table="papers", where="id='%s'" % pub_id)
    text = ''
    for tf in text_fields:
      if tf is not None:
        text += tf

    texts.append(text)

  return texts
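A batch counterpart is mentioned in the docstring but not shown. A rough sketch of what it could look like, assuming MyMySQL.select accepts an arbitrary where clause as in the other examples:

def get_texts_batch(pub_ids, use_title=True, use_abs=True):
  '''
  Hypothetical batch version: a single query for all ids, faster but
  holds every row in memory at once.
  '''
  db = MyMySQL(db='csx', user='******', passwd='')

  fields = ["id"]
  if use_title: fields.append("title")
  if use_abs: fields.append("abstract")

  ids_str = ','.join("'%s'" % pid for pid in pub_ids)
  rows = db.select(fields=fields, table="papers", where="id IN (%s)" % ids_str)

  by_id = {str(row[0]): ' '.join(f for f in row[1:] if f) for row in rows}
  return [by_id.get(str(pid), '') for pid in pub_ids]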
Example #18
def write_citations_queries(name1, n1, name2, n2) :

	db = MyMySQL(db=config.DB_NAME)

	if not os.path.exists(config.QUERY_SETS_PATH) :
		os.mkdir(config.QUERY_SETS_PATH)

	path1 = config.QUERY_SETS_PATH + name1
	path2 = config.QUERY_SETS_PATH + name2

#	write_citations_query_set_files(db, path1, n1, path2, n2)

	write_query_set_folder(db, path1)
	write_query_set_folder(db, path2)
Example #19
    def __init__(self):
        '''
		Stores the process id and creates a task manager to get 
		and update tasks.
		'''
        # Zeno task manager
        self.tasks = zeno.TasksManager("tasks",
                                       host=config.DB_HOST,
                                       user=config.DB_USER,
                                       passwd=config.DB_PASSWD)

        # Database connection
        self.db = MyMySQL(db=config.DB_NAME,
                          host=config.DB_HOST,
                          user=config.DB_USER,
                          passwd=config.DB_PASSWD)

        # Logging configuration
        self.log = utils.config_logging(
            'downloader',
            stream=sys.stdout,
            level=logging.DEBUG,
            format='%(asctime)s (%(name)s) [%(levelname)6s]: %(message)s',
            datefmt="%Y-%m-%d %H:%M:%S")
Example #20
def get_layer_results(queries, searcher, folder, layer) :

	db = MyMySQL(db=config.DATASET)

	def get_pub(pub_id) :
		return db.select_one("title", table="papers", where="id='%s'" % pub_id)

	def get_author(author_id) :
		return db.select_one("name", table="authors", where="cluster=%s" % author_id)

	def get_venue(venue_id):
		abbrev, name = db.select_one(["abbrev", "name"], table="venues", where="id=%s" % venue_id)
		return " ".join((abbrev, name)).strip()

	def get_keyword(kw):
		return kw

	# Create the folder that will hold the results for this layer
	if not os.path.exists(folder) :
		os.makedirs(folder)

	print "\n%s" % folder

	# Each layer has a different handler to get the name of the entity
	get_entities = {'paper': get_pub,
									'author': get_author,
									'venue': get_venue,
									'ngram': get_keyword}

	# Now fetch the results and save them
	for query in queries :

		file_path = os.path.join(folder, query.replace(' ', '+') + ".txt")
		print " ", query

		entity_ids = searcher.search(query, rtype=layer, limit=50)
		with open(file_path, 'w') as file :
			for eid in entity_ids:
				name = get_entities[layer](eid).strip()
				print >> file, "%s" % (name.encode("UTF-8"))
Example #21
def keyword_centric(keyword, from_db, to_db):

    db = MyMySQL(db=from_db)
    pub_ids = db.select("paper_id",
                        table="keywords",
                        where="kw='%s'" % keyword)

    nodes = set()
    new_nodes = set()
    new_nodes.update(pub_ids)

    n = 50000
    while len(nodes) < n:

        new_nodes = get_next_hop(new_nodes)
        nodes.update(new_nodes)
        print len(nodes)

    print "Adding %d nodes." % len(nodes)

    new_db = MyMySQL(db=to_db)

    #	values = ','.join(['%s'%id for id in nodes])
    new_db.insert(into="use_papers", fields=["paper_id"], values=list(nodes))
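get_next_hop is defined elsewhere. A sketch of one possible implementation, mirroring get_neighbours from the next example and assuming a module-level db connection to the source database:

def get_next_hop(pub_ids):
    # Hypothetical: every paper one citation edge away from the given set.
    neighbours = set()
    for pub_id in pub_ids:
        rows = db.select(["citing", "cited"],
                         table="graph",
                         where="citing='%s' OR cited='%s'" % (pub_id, pub_id))
        for citing, cited in rows:
            neighbours.add(citing)
            neighbours.add(cited)
    return neighbours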
Example #22
'''
Created on Jun 29, 2015

@author: luamct
'''
from mymysql.mymysql import MyMySQL
import random
import networkx as nx
from utils import progress

db = MyMySQL(db='csx')


def get_cited(pub_id):
    return db.select("cited", table="graph", where="citing='%s'" % pub_id)


def get_citing(pub_id):
    return db.select("citing", table="graph", where="cited='%s'" % pub_id)


def get_neighbours(pub_id):
    return db.select(["citing", "cited"],
                     table="graph",
                     where="citing='%s' OR cited='%s'" % (pub_id, pub_id))


def depth_walk():

    ids = db.select("id", table="papers", limit=10000)
Example #23
class Tokenizer():
    def __init__(self):

        # Zeno task manager
        self.tasks = zeno.TasksManager("tasks",
                                       host=config.DB_HOST,
                                       user=config.DB_USER,
                                       passwd=config.DB_PASSWD)

        # Database connection
        self.db = MyMySQL(db=config.DB_NAME,
                          host=config.DB_HOST,
                          user=config.DB_USER,
                          passwd=config.DB_PASSWD)

        # Logging configuration
        self.log = utils.config_logging(
            'tokenizer',
            stream=sys.stdout,
            level=logging.DEBUG,
            format='%(asctime)s (%(name)s) [%(levelname)6s]: %(message)s',
            datefmt="%Y-%m-%d %H:%M:%S")

        self.MIN_TOKENS = 10

        # Create the folders if they don't exist yet
        utils.ensure_folder(os.path.dirname(config.TOKENS_PATH))
        utils.ensure_folder(os.path.dirname(config.TOKENS_PATH_PARTS))

    def save_tokens(self, tokens, tok_file):
        counter = Counter(tokens)
        with open(tok_file, 'w') as f:
            # 			print >> f, (' '.join(tokens)).encode("utf-8")
            lines = [
                "%s %d" % (token, count) for (token, count) in counter.items()
            ]
            print >> f, '\n'.join(lines).encode("UTF-8")

    def get_section(self, html_file, possible_section_names,
                    possible_next_sections):

        # Open and parse HTML, then extract all textual content from each paragraph
        h = html.parse(html_file)  #, parser=etree.XMLParser(encoding="utf-8"))
        pars = [
            paragraph.text_content().lower().encode("UTF-8")
            for paragraph in h.xpath("//p")
        ]  # .encode("utf-8")

        # First we go backwards trying to find the latest occurrence of
        # one of the possible names of the section of interest
        begin = None
        for i in reversed(xrange(len(pars))):
            if match_any(pars[i], possible_section_names):
                begin = i
                break

        # If the start wasn't found, just halt right away
        if (begin is None):
            return ""

        # Otherwise we can look for the end of the section starting from the start
        # of the found section.
        end = None
        for j in xrange(begin + 1, len(pars)):
            if match_any(pars[j], possible_next_sections):
                end = j
                break

        # End of section not found, so it's not safe to keep this content;
        # return an empty string.
        if (end is None):
            return ""

        # Otherwise join all paragraphs inside the section found
        return unicode("".join([fix_hyphens(p) for p in pars[begin:end]]),
                       "UTF-8")

    def get_title_and_abstract(self, paper_id):
        title, abstract = self.db.select_one(["title", "abstract"],
                                             table="papers",
                                             where="id='%s'" % paper_id)
        if title is None: title = ""
        if abstract is None: abstract = ""

        return title, abstract

    def process_full_text(self, paper_id):
        '''
		Tokenizes the full text of the given document and stores it on disk.
		'''
        txt_file = config.TXT_PATH % paper_id
        tok_file = config.TOKENS_PATH % paper_id

        with open(txt_file, 'r') as f:
            text = unicode(f.read(), "utf-8")

        tokens = utils.tokenize(text)
        if (len(tokens) < self.MIN_TOKENS):
            raise MinimumTokensException(
                ("Minimum number of tokens (%d) could not be extracted. " %
                 self.MIN_TOKENS) + "Document is likely to be badly encoded.")

        self.save_tokens(tokens, tok_file)

    def process_important_parts(self, paper_id):
        '''
		Tokenizes specific parts of the document deemed important, such as
		the title, abstract and conclusion.
		'''
        html_file = config.HTML_PATH % paper_id
        tokens_file = config.TOKENS_PATH_PARTS % paper_id

        # Get title and abstract from DB
        title, abstract = self.get_title_and_abstract(paper_id)

        # Get conclusion from full text
        conclusion = self.get_section(
            html_file, ['conclusion', 'concluding', 'summary'],
            ['reference', 'bibliography', 'acknowledg', 'appendix'])

        # Uncomment if you don't want to use the abstract from the DB
        #		abstract = self.get_section(html_file, ['abstract'], ['categories', 'keywords', 'introduction'])

        # Tokenize each part and save into a file
        tokens = []
        tokens += utils.tokenize(title)
        tokens += utils.tokenize(abstract)
        tokens += utils.tokenize(conclusion)

        if (len(tokens) < self.MIN_TOKENS):
            raise MinimumTokensException(
                ("Minimum number of tokens (%d) could not be extracted." %
                 self.MIN_TOKENS) +
                "Document is likely to have decoding problems.")

        self.save_tokens(tokens, tokens_file)

    def run(self):

        self.log.info("Starting process %d" % os.getpid())

        # Keep running until a stop file is found
        while (not os.path.exists("stop")):

            try:
                paper_id = self.tasks.get_next("CONVERTED")

                # Pre-processes the full text and only the important parts to different folders
                self.process_full_text(paper_id)
                self.process_important_parts(paper_id)

                # Update the task status and the disk in which the file was saved.
                self.tasks.update_success(paper_id, "TOKENIZED")

                # Everything went OK if got here
                self.log.info("%s: OK" % paper_id)

            # Nothing to collect
            except NothingToProcessException:
                self.log.info("Nothing to process.")
                break

            except MinimumTokensException, e:
                self.log.error("%s: FAIL\n%s\n" %
                               (paper_id, traceback.format_exc()))
                self.tasks.update_error(paper_id, message=str(e))

            # Any other exception we log the traceback and update the DB
            except Exception:
                self.log.error("%s: FAIL\n%s\n" %
                               (paper_id, traceback.format_exc()))
                self.tasks.update_error(paper_id, "TOKENIZE_ERROR")
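A hypothetical entry point for this worker; the loop in run() stops when a file named 'stop' appears in the working directory:

if __name__ == '__main__':
    # Run a single tokenizer worker until a 'stop' file is created.
    Tokenizer().run()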
Example #24
    def search(self, query, exclude=[], limit=20, force=False):

        # import warnings
        # warnings.filterwarnings('error')

        file_path = config.CITERANK_FILE_PATH
        if not os.path.exists(file_path):
            g = nx.DiGraph()
            g.add_edges_from(model.get_all_edges())

            # Remove documents from the exclude list
            g.remove_nodes_from(exclude)

            # Get year of each paper for assembling personalization array next
            db = MyMySQL(db=config.DATASET)
            rows = db.select(["id", "year"], table="papers")
            years = {}
            for pub_id, year in rows:
                if year is not None:
                    years[pub_id] = year

            # Calculate the median to use in the missing values
            year_median = np.median(years.values())

            # Create a personalization array by exponentially decaying
            # each paper's factor by its age
            pers = {}
            for node in g.nodes():
                if (node not in years) or (years[node] < 1960) or (years[node] > 2013):
                    years[node] = year_median

                pers[node] = np.exp(float(years[node] - 2013) / self.tau)
            #				try :
            #				except Warning:
            #					print "Warning!"
            #					print node
            #					print year
            #					print

            print "Running PageRank with %d nodes and age defined personalization vector." % g.number_of_nodes(
            )
            r = nx.pagerank(g, personalization=pers)

            print "Writing results"
            cPickle.dump(r, open(file_path, "w"))

        # Loads cached page rank values for every node
        r = cPickle.load(open(file_path, "r"))

        # Sorts documents decreasingly by page rank value
        ids, _score_ = zip(
            *sorted(r.items(), key=lambda (k, v): v, reverse=True))

        # Fetch all documents that contain at least one of the query terms
        # and store them in a set for fast lookup.
        pub_ids = self.index.search(query,
                                    search_fields=["title", "abstract"],
                                    return_fields=["id"],
                                    ignore=exclude)

        pub_ids = set([pid for (pid, ) in pub_ids])

        results = []
        for id in ids:
            if id in pub_ids:
                results.append(id)
                if len(results) == limit:
                    break

        return results
Example #25
import lxml.html
from pylucene import DocField, Index
from mymysql.mymysql import MyMySQL
import random
from utils import progress
import config
import os
from random import Random

#URL_TEMPLATE = "http://www.informatik.uni-trier.de/~ley/db/conf/index-%s.html"
#URL_TEMPLATE = "http://dblp.uni-trier.de/db/hc/conf/index-%s.html"
URL_TEMPLATE = "http://dblp.uni-trier.de/db/%s"

IGNORE_TERMS = ["proceedings", "proc."]

db = MyMySQL(db='aminer')


def download_venues(venue_type):
    '''
	Venue types available: ['conf', 'journals'].
	'''

    folder = config.DATA + ("venues/html/%s/" % venue_type)
    url = URL_TEMPLATE % venue_type

    pos = 1
    while (True):

        print "Processing %d-%d" % (pos, pos + 99)
Example #26
class Downloader():
    def __init__(self):
        '''
		Stores the process id and creates a task manager to get 
		and update tasks.
		'''
        # Zeno task manager
        self.tasks = zeno.TasksManager("tasks",
                                       host=config.DB_HOST,
                                       user=config.DB_USER,
                                       passwd=config.DB_PASSWD)

        # Database connection
        self.db = MyMySQL(db=config.DB_NAME,
                          host=config.DB_HOST,
                          user=config.DB_USER,
                          passwd=config.DB_PASSWD)

        # Logging configuration
        self.log = utils.config_logging(
            'downloader',
            stream=sys.stdout,
            level=logging.DEBUG,
            format='%(asctime)s (%(name)s) [%(levelname)6s]: %(message)s',
            datefmt="%Y-%m-%d %H:%M:%S")

    def parse_error(self, content):
        '''
		Parses the returned response's HTML and raises the appropriate exception.
		'''
        if content.find("Download Limit Exceeded") >= 0:
            raise LimitReachedException()
        else:
            raise Exception()

    def make_csx_url(self, id):
        return "http://citeseerx.ist.psu.edu/viewdoc/download?doi=%s&rep=rep1&type=pdf" % id

    def download_from_csx(self, paper_id):
        ''' 
		Downloads the paper's PDF directly from CiteSeerX.
		'''

        # Build the CiteSeerX download url from the paper id
        url = "http://citeseerx.ist.psu.edu/viewdoc/download?doi=%s&rep=rep1&type=pdf" % paper_id

        headers = {'User-Agent': 'Chrome/34.0.1847.116 (X11; Linux x86_64)'}
        response = requests.get(url, headers=headers)

        if (response.status_code != 200):
            raise RequestException("%d: %s" %
                                   (response.status_code, response.reason))

        if response.headers['Content-Type'].startswith('text/html'):
            self.parse_error(response.content)

        # Save file to the local disk
        file_path = os.path.join(self.data_folder, "%s.pdf" % paper_id)
        img_file = open(file_path, "wb")
        img_file.write(response.content)
        img_file.close()

    def get_all_urls(self, paper_id):
        ''' Returns all known URLs for the paper (CiteSeerX and external ones). '''

        cluster_id = self.db.select_one("cluster",
                                        table="papers",
                                        where="id='%s'" % paper_id)

        alt_paper_ids = self.db.select("id",
                                       table="papers",
                                       where="cluster=%d" % cluster_id)

        urls = []
        for altern_id in alt_paper_ids:
            urls = urls + [self.make_csx_url(altern_id)]

            other_urls = self.db.select("url",
                                        table="urls",
                                        where="paperid='%s'" % altern_id)
            urls = other_urls + urls

        return urls

    def download(self, paper_id):
        ''' 
		Tries every known URL for the paper and saves the first valid PDF found.
		'''
        headers = {'User-Agent': 'Chrome/34.0.1847.116 (X11; Linux x86_64)'}

        # Get url from the database
        urls = self.get_all_urls(paper_id)
        for url in urls:

            # Only supports PDF for now
            if url[-3:].lower() != "pdf":
                continue

            try:
                response = requests.get(url, headers=headers)
            except ConnectionError:
                self.log.warn("Connection error! Ignoring URL '%s'" % (url))
                continue

            response_type = response.headers['Content-Type']

            if response_type.startswith('text/html'):
                if response.content.find("Download Limit Exceeded") >= 0:
                    raise LimitReachedException()
                else:
                    continue

            if (response.status_code != 200) or (response_type !=
                                                 "application/pdf"):
                continue

# 				raise MissingURLException()
# 			if (response.status_code != 200) :
# 				raise RequestException("%d: %s" % (response.status_code, response.reason))

            # Save file to the local disk
            file_path = config.PDF_PATH % paper_id
            img_file = open(file_path, "wb")
            img_file.write(response.content)
            img_file.close()

            # Download successfully completed
            return True

        # If we got here, no valid URL was found
        return False

    def run(self):

        self.log.info("Starting %s." % os.getpid())

        # Keep running until a stop file is found
        while (not os.path.exists("stop")):

            try:
                paper_id = self.tasks.get_next("START")

                if not self.download(paper_id):
                    raise DownloadException("Could not download paper '%s'." %
                                            paper_id)

                # Update the task status and the disk in which the file was saved.
                self.tasks.update_success(paper_id, "DOWNLOADED")

                # Everything went OK if got here
                self.log.info("%s: OK" % paper_id)

            # Nothing to collect
            except NothingToProcessException:
                self.log.error("Nothing to process.")
                break

            except LimitReachedException:
                self.log.error("Request limit reached!! Waiting...")
                self.tasks.update_release(
                    paper_id, "Request limit reached. Will try again later.")
                time.sleep(60 * 60)

            # URL missing in the DB or not returning the resource.
            except DownloadException, e:
                self.log.error("%s: FAIL" % (paper_id))
                self.tasks.update_error(paper_id, message=str(e))

            # Request errors
# 			except RequestException, e:
# 				self.log("%s: %s\n%s" % (paper_id, e.msg, traceback.format_exc()), show=True)
# 				self.db.update_status(paper_id, DBManager.DOWNLOAD_ERROR)

            # Any other exception we log the traceback, update the DB and life goes on
            except Exception, e:
                self.log.error("%s: FAIL: %s" %
                               (paper_id, traceback.format_exc()))
                self.tasks.update_error(paper_id, message=str(e))
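A hypothetical launcher for the downloader; on fork-based platforms several workers can share the same task table:

if __name__ == '__main__':
    from multiprocessing import Process

    # Start a few download workers; create a 'stop' file to halt them.
    workers = [Process(target=Downloader().run) for _ in xrange(4)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()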
Example #27
from contexts import contexts
from collections import Counter, defaultdict
import nltk
import utils
from utils import tokenize
import logging
import sys
import re
from config import DATA, TOKENS_PATH, DB_NAME
from mymysql.mymysql import MyMySQL
from sklearn.feature_extraction.text import TfidfVectorizer

stopwords = set(nltk.corpus.stopwords.words('english'))

NUMBER = re.compile("\d+$")
VARIABLE = re.compile("\w\d$")

db = MyMySQL(db=DB_NAME)


def filter_tokens(tokens):
    ''' 
	Filter out some tokens before the analysis (stop words, very short tokens and numbers).
	'''
    valid = lambda (token, _freq) : \
         not (token in stopwords) and \
         (len(token) >= 3) and \
         not re.match(NUMBER, token)

    return filter(valid, tokens)


def read_line(line):
Example #28
'''
Created on May 26, 2015

@author: luamct
'''
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.preprocessing.label import MultiLabelBinarizer
from mymysql.mymysql import MyMySQL
from collections import Counter, defaultdict
import numpy as np
import random
import nltk
from utils import PubTexts

db = MyMySQL(db='csx', user='******', passwd='')

#rows = db.select(fields=["id", "title", "abstract"], table="papers")
#pubs = {str(id): (title, abs) for id, title, abs in rows}
#pubs = {str(id): (title + ' ' + abs) for id, title, abs in rows}

MAX_KWS = 10


def get_keywords(min=1):

    kws = db.select(fields=("id", "kw"),
                    table=("papers", "keywords"),
                    join_on=("id", "paper_id"))
    count = Counter([kw for _pid, kw in kws])
Example #29
'''
@author: hugo
'''

import os
import sys
import time
import numpy as np
import config
from collections import defaultdict
from mymysql.mymysql import MyMySQL
from datasets.mag import get_selected_pubs
from ranking.kddcup_searchers import SimpleSearcher, Searcher
from evaluation.kddcup_expt import get_results_file, save_results

db = MyMySQL(db=config.DB_NAME, user=config.DB_USER, passwd=config.DB_PASSWD)


def rank_affils(selected_affils,
                conf_name,
                year,
                searcher,
                show=True,
                results_file=None):
    conf_id = db.select("id",
                        "confs",
                        where="abbr_name='%s'" % conf_name,
                        limit=1)[0]
    start = time.time()

    if searcher.name() == "SimpleSearcher":
Example #30
'''
Created on Jun 9, 2015

@author: luamct
'''
from collections import defaultdict
from mymysql.mymysql import MyMySQL
from utils import progress, plot
import sys
import config
from pylucene import Index, DocField

IGNORE_TERMS = ["proceedings", "proc."]

db = MyMySQL(db='aminer', user='******', passwd='')


def load_existing_venues():
    rows = db.select(fields=["id", "name"], table="venues")
    return {name: int(id) for id, name in rows}


def save_venue(id, name):
    db.insert(into="venues", fields=["id", "name"], values=[id, name])


def save_citations(id, cits):
    values = [(id, cid) for cid in cits]
    db.insert(into="graph", fields=["citing", "cited"], values=values)