def main(argv):
    query = None
    usr = None
    output_file = None
    pwd = None
    n = 20

    try:
        opts, _args = getopt.getopt(argv, "hq:o:n:u:p:")
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            usage()
            sys.exit()
        elif opt == "-q":
            query = arg
        elif opt == "-o":
            output_file = arg
        elif opt == "-n":
            n = int(arg)
        elif opt == "-u":
            usr = arg
        elif opt == "-p":
            pwd = arg
        else:
            print "Invalid option: %s" % opt

    # Check mandatory arguments
    if (not query or not usr or not pwd):
        usage()
        sys.exit(2)

    s = searchers.Searcher(**config.PARAMS)
    pub_ids = s.search(query, limit=n)

    # Derive the output file name from the query if none was given
    if not output_file:
        output_file = utils.get_graph_file_name(query)

    # Write the graph structure as a GEXF file
    nx.write_gexf(s.graph, output_file)

    # Print the results
    db = MyMySQL(db='csx', user=usr, passwd=pwd)
    for id in pub_ids:
        print "%12s\t %s" % (id, db.select_one("title", table="papers", where="id='%s'" % id))
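# Hedged usage sketch (hypothetical script name and credentials; assumes a
# usage() helper and an entry point such as `main(sys.argv[1:])` are defined
# elsewhere in this module):
#
#   python search.py -q "latent dirichlet allocation" -n 50 \
#       -u dbuser -p dbpass -o lda_graph.gexf
#
# The -o flag is optional: when omitted, the output file name is derived from
# the query via utils.get_graph_file_name().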
def find_ids_unsupervised(titles, index_folder):
    db = MyMySQL(db='csx')
    index = Index(index_folder)

    found = 0
    doc_ids = []
    for title in titles:
        top_docs, scores = index.search(title,
                                        search_fields=["title"],
                                        return_fields=["id"],
                                        return_scores=True,
                                        limit=5)

        # To decide whether the most similar title in the index is a hit, we
        # check if its score is significantly higher than those of the hits
        # that follow it (second to fifth).
        if len(scores) > 2 and (scores[0] > 2 * np.mean(scores[1:])):
            doc_ids.append(top_docs[0][0])
            found += 1
        else:
            doc_ids.append("")

        # Only enable for debugging and finding a threshold
        if 0:
            print "-------"
            print "%s" % (title)
            print "-------"
            for i, (id,) in enumerate(top_docs):
                hit_title = db.select_one("title", table="papers", where="id='%s'" % id)
                print "%.2f\t%s" % (scores[i], hit_title.encode("UTF-8"))

            if (scores[0] > 2 * np.mean(scores[1:])):
                print "Found!",
                op = '>'
            else:
                print "Not found!",
                op = '<'

            print "(%.2f %s %.2f)\n" % (scores[0], op, 2 * np.mean(scores[1:]))

    return doc_ids
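# Hedged usage sketch for find_ids_unsupervised (hypothetical titles and index
# path; assumes an index built beforehand with "title" and "id" fields):
#
#   titles = ["Latent Dirichlet Allocation",
#             "The Anatomy of a Large-Scale Hypertextual Web Search Engine"]
#   doc_ids = find_ids_unsupervised(titles, "/data/index")
#   # doc_ids[i] is "" whenever the top hit's score is not at least twice the
#   # mean score of the hits that follow it.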
def get_texts(pub_ids, use_title=True, use_abs=True):
    '''
    This is a non-batch version. Much slower, but more memory efficient.
    '''
    db = MyMySQL(db='csx', user='******', passwd='')

    fields = []
    if use_title:
        fields.append("title")
    if use_abs:
        fields.append("abstract")

    texts = []
    for pub_id in pub_ids:
        text_fields = db.select_one(fields=fields, table="papers", where="id='%s'" % pub_id)

        # Concatenate whichever of the requested fields are non-null
        text = ''
        for tf in text_fields:
            if tf is not None:
                text += tf

        texts.append(text)

    return texts
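# Hedged sketch of a batch variant (an assumption, not the original code: it
# presumes MyMySQL.select takes the same fields/table/where arguments as
# select_one and returns one row per matching id). Faster for long id lists,
# but note the returned rows are not guaranteed to follow pub_ids order:
#
#   def get_texts_batch(pub_ids, use_title=True, use_abs=True):
#       db = MyMySQL(db='csx', user='******', passwd='')
#       fields = (["title"] if use_title else []) + (["abstract"] if use_abs else [])
#       where = "id IN (%s)" % ','.join("'%s'" % pid for pid in pub_ids)
#       rows = db.select(fields=fields, table="papers", where=where)
#       return [''.join(tf for tf in row if tf is not None) for row in rows]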
class Tokenizer():

    def __init__(self):
        # Zeno task manager
        self.tasks = zeno.TasksManager("tasks",
                                       host=config.DB_HOST,
                                       user=config.DB_USER,
                                       passwd=config.DB_PASSWD)

        # Database connection
        self.db = MyMySQL(db=config.DB_NAME,
                          host=config.DB_HOST,
                          user=config.DB_USER,
                          passwd=config.DB_PASSWD)

        # Logging configuration
        self.log = utils.config_logging('tokenizer', stream=sys.stdout, level=logging.DEBUG,
                                        format='%(asctime)s (%(name)s) [%(levelname)6s]: %(message)s',
                                        datefmt="%Y-%m-%d %H:%M:%S")

        self.MIN_TOKENS = 10

        # Create the output folders if they don't exist yet
        utils.ensure_folder(os.path.dirname(config.TOKENS_PATH))
        utils.ensure_folder(os.path.dirname(config.TOKENS_PATH_PARTS))

    def save_tokens(self, tokens, tok_file):
        # Write one "<token> <count>" pair per line
        counter = Counter(tokens)
        with open(tok_file, 'w') as f:
            lines = ["%s %d" % (token, count) for (token, count) in counter.items()]
            print >> f, '\n'.join(lines).encode("UTF-8")

    def get_section(self, html_file, possible_section_names, possible_next_sections):
        # Open and parse the HTML, then extract the textual content of each paragraph
        h = html.parse(html_file)
        pars = [paragraph.text_content().lower().encode("UTF-8") for paragraph in h.xpath("//p")]

        # First we go backwards, trying to find the latest occurrence of one
        # of the possible names of the section of interest
        begin = None
        for i in reversed(xrange(len(pars))):
            if match_any(pars[i], possible_section_names):
                begin = i
                break

        # If the start wasn't found, halt right away
        if (begin is None):
            return ""

        # Otherwise, look for the end of the section, starting from the
        # beginning of the section just found
        end = None
        for j in xrange(begin + 1, len(pars)):
            if match_any(pars[j], possible_next_sections):
                end = j
                break

        # End of section not found, so it's not safe to keep this content:
        # return an empty string
        if (end is None):
            return ""

        # Otherwise, join all paragraphs inside the section found
        return unicode("".join([fix_hyphens(p) for p in pars[begin:end]]), "UTF-8")

    def get_title_and_abstract(self, paper_id):
        title, abstract = self.db.select_one(["title", "abstract"], table="papers", where="id='%s'" % paper_id)

        if title is None:
            title = ""
        if abstract is None:
            abstract = ""

        return title, abstract

    def process_full_text(self, paper_id):
        '''
        Tokenizes and stores on disk the full text of the given document.
        '''
        txt_file = config.TXT_PATH % paper_id
        tok_file = config.TOKENS_PATH % paper_id

        with open(txt_file, 'r') as f:
            text = unicode(f.read(), "utf-8")

        tokens = utils.tokenize(text)
        if (len(tokens) < self.MIN_TOKENS):
            raise MinimumTokensException(
                "Minimum number of tokens (%d) could not be extracted. "
                "Document is likely to be badly encoded." % self.MIN_TOKENS)

        self.save_tokens(tokens, tok_file)

    def process_important_parts(self, paper_id):
        '''
        Tokenizes some specific parts of the document deemed important,
        like the title, abstract and conclusion.
        '''
        html_file = config.HTML_PATH % paper_id
        tokens_file = config.TOKENS_PATH_PARTS % paper_id

        # Get title and abstract from the DB
        title, abstract = self.get_title_and_abstract(paper_id)

        # Get the conclusion from the full text
        conclusion = self.get_section(html_file,
                                      ['conclusion', 'concluding', 'summary'],
                                      ['reference', 'bibliography', 'acknowledg', 'appendix'])

        # Uncomment if you don't want to use the abstract from the DB
        # abstract = self.get_section(html_file, ['abstract'], ['categories', 'keywords', 'introduction'])

        # Tokenize each part and save everything into one file
        tokens = []
        tokens += utils.tokenize(title)
        tokens += utils.tokenize(abstract)
        tokens += utils.tokenize(conclusion)

        if (len(tokens) < self.MIN_TOKENS):
            raise MinimumTokensException(
                ("Minimum number of tokens (%d) could not be extracted. " % self.MIN_TOKENS) +
                "Document is likely to have decoding problems.")

        self.save_tokens(tokens, tokens_file)

    def run(self):
        self.log.info("Starting process %d" % os.getpid())

        # Keep running until a stop file is found
        while (not os.path.exists("stop")):
            try:
                paper_id = self.tasks.get_next("CONVERTED")

                # Pre-process the full text and the important parts into different folders
                self.process_full_text(paper_id)
                self.process_important_parts(paper_id)

                # Update the task status
                self.tasks.update_success(paper_id, "TOKENIZED")

                # Everything went OK if we got here
                self.log.info("%s: OK" % paper_id)

            # Nothing left to process
            except NothingToProcessException:
                self.log.info("Nothing to process.")
                break

            # Too few valid tokens: log the traceback and flag the task as failed
            except MinimumTokensException, e:
                self.log.error("%s: FAIL\n%s\n" % (paper_id, traceback.format_exc()))
                self.tasks.update_error(paper_id, message=str(e))

            # For any other exception, log the traceback and update the DB
            except Exception:
                self.log.error("%s: FAIL\n%s\n" % (paper_id, traceback.format_exc()))
                self.tasks.update_error(paper_id, "TOKENIZE_ERROR")
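# Hedged launcher sketch (an assumption: the original module may define its
# own entry point elsewhere). The worker loops until a file named "stop"
# appears in the working directory:
#
#   if __name__ == '__main__':
#       Tokenizer().run()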
class Downloader():

    def __init__(self):
        '''
        Stores the process id and creates a task manager to get and update tasks.
        '''
        # Zeno task manager
        self.tasks = zeno.TasksManager("tasks",
                                       host=config.DB_HOST,
                                       user=config.DB_USER,
                                       passwd=config.DB_PASSWD)

        # Database connection
        self.db = MyMySQL(db=config.DB_NAME,
                          host=config.DB_HOST,
                          user=config.DB_USER,
                          passwd=config.DB_PASSWD)

        # Logging configuration
        self.log = utils.config_logging('downloader', stream=sys.stdout, level=logging.DEBUG,
                                        format='%(asctime)s (%(name)s) [%(levelname)6s]: %(message)s',
                                        datefmt="%Y-%m-%d %H:%M:%S")

    def parse_error(self, content):
        '''
        Parses the returned HTML content and raises the appropriate exception.
        '''
        # str.find returns -1 when the substring is absent, so compare explicitly
        if content.find("Download Limit Exceeded") >= 0:
            raise LimitReachedException()
        else:
            raise Exception()

    def make_csx_url(self, id):
        return "http://citeseerx.ist.psu.edu/viewdoc/download?doi=%s&rep=rep1&type=pdf" % id

    def download_from_csx(self, paper_id):
        '''
        Downloads the PDF for the given paper id from CiteSeerX.
        '''
        url = self.make_csx_url(paper_id)

        headers = {'User-Agent': 'Chrome/34.0.1847.116 (X11; Linux x86_64)'}
        response = requests.get(url, headers=headers)

        if (response.status_code != 200):
            raise RequestException("%d: %s" % (response.status_code, response.reason))

        # An HTML response means an error page was returned instead of the PDF
        if response.headers['Content-Type'].startswith('text/html'):
            self.parse_error(response.content)

        # Save file to the local disk
        file_path = os.path.join(self.data_folder, "%s.pdf" % paper_id)
        with open(file_path, "wb") as pdf_file:
            pdf_file.write(response.content)

    def get_all_urls(self, paper_id):
        '''
        Returns all candidate URLs for the paper: the CiteSeerX download URL
        and any known external URLs for every paper in the same cluster.
        '''
        cluster_id = self.db.select_one("cluster", table="papers", where="id='%s'" % paper_id)
        alt_paper_ids = self.db.select("id", table="papers", where="cluster=%d" % cluster_id)

        urls = []
        for altern_id in alt_paper_ids:
            urls = urls + [self.make_csx_url(altern_id)]

            other_urls = self.db.select("url", table="urls", where="paperid='%s'" % altern_id)
            urls = other_urls + urls

        return urls

    def download(self, paper_id):
        '''
        Tries every known URL for the given paper until a PDF is successfully
        downloaded. Returns True on success, False otherwise.
        '''
        headers = {'User-Agent': 'Chrome/34.0.1847.116 (X11; Linux x86_64)'}

        # Get candidate URLs from the database
        urls = self.get_all_urls(paper_id)
        for url in urls:

            # Only supports PDF for now
            if url[-3:].lower() != "pdf":
                continue

            try:
                response = requests.get(url, headers=headers)
            except ConnectionError:
                self.log.warn("Connection error! Ignoring URL '%s'" % (url))
                continue

            response_type = response.headers['Content-Type']

            # An HTML response either means we hit the request limit or that
            # the URL no longer points to the resource
            if response_type.startswith('text/html'):
                if response.content.find("Download Limit Exceeded") >= 0:
                    raise LimitReachedException()
                else:
                    continue

            if (response.status_code != 200) or (response_type != "application/pdf"):
                continue

            # Save file to the local disk
            file_path = config.PDF_PATH % paper_id
            with open(file_path, "wb") as pdf_file:
                pdf_file.write(response.content)

            # Download successfully completed
            return True

        # If we got here, no valid URL was found
        return False

    def run(self):
        self.log.info("Starting process %d." % os.getpid())

        # Keep running until a stop file is found
        while (not os.path.exists("stop")):
            try:
                paper_id = self.tasks.get_next("START")

                if not self.download(paper_id):
                    raise DownloadException("Could not download paper '%s'." % paper_id)

                # Update the task status and the disk on which the file was saved.
                self.tasks.update_success(paper_id, "DOWNLOADED")

                # Everything went OK if we got here
                self.log.info("%s: OK" % paper_id)

            # Nothing left to process
            except NothingToProcessException:
                self.log.error("Nothing to process.")
                break

            # Too many requests: release the task and back off for an hour
            except LimitReachedException:
                self.log.error("Request limit reached!! Waiting...")
                self.tasks.update_release(paper_id, "Request limit reached. Will try again later.")
                time.sleep(60 * 60)

            # URL missing in the DB or not returning the resource
            except DownloadException, e:
                self.log.error("%s: FAIL" % (paper_id))
                self.tasks.update_error(paper_id, message=str(e))

            # For any other exception, log the traceback, update the DB and move on
            except Exception, e:
                self.log.error("%s: FAIL: %s" % (paper_id, traceback.format_exc()))
                self.tasks.update_error(paper_id, message=str(e))
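# Hedged launcher sketch (hypothetical worker count; assumes the zeno task
# manager hands out disjoint tasks, so several downloader processes can run
# side by side, each stoppable via the "stop" file):
#
#   from multiprocessing import Process
#
#   def start_worker():
#       Downloader().run()
#
#   if __name__ == '__main__':
#       workers = [Process(target=start_worker) for _ in range(4)]
#       for w in workers:
#           w.start()
#       for w in workers:
#           w.join()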