Example #1
def main(argv):
    query = None
    usr = None
    output_file = None
    pwd = None
    n = 20

    try:
        opts, _args_ = getopt.getopt(argv, "hq:o:n:u:p:")
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            usage()
            sys.exit()

        elif opt == "-q":
            query = arg

        elif opt == "-o":
            output_file = arg

        elif opt == "-n":
            n = int(arg)

        elif opt == "-u":
            usr = arg

        elif opt == "-p":
            pwd = arg

        else:
            print "Invalid option: %s" % opt

    # Check mandatory arguments
    if (not query or not usr or not pwd):
        usage()
        sys.exit(2)

    s = searchers.Searcher(**config.PARAMS)
    pub_ids = s.search(query, limit=n)

    if not output_file:
        output_file = utils.get_graph_file_name(query)

    # Writes the graph structure as a gexf file
    nx.write_gexf(s.graph, output_file)

    # Prints the results
    db = MyMySQL(db='csx', user=usr, passwd=pwd)
    for id in pub_ids:
        print "%12s\t %s" % (
            id, db.select_one("title", table="papers", where="id='%s'" % id))
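Based on the option string "hq:o:n:u:p:" above, -q (the search query), -u (database user) and -p (password) are mandatory, while -o (output .gexf file) and -n (number of results, default 20) are optional. A hypothetical invocation, assuming the script is saved as search.py (the file name is an assumption, not part of the original project):

python search.py -q "topic modeling" -u dbuser -p dbpass -n 50 -o topic_modeling.gexf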
Example #2
def find_ids_unsupervised(titles, index_folder):

    db = MyMySQL(db='csx')
    index = Index(index_folder)

    found = 0
    doc_ids = []
    for title in titles:
        top_docs, scores = index.search(title,
                                        search_fields=["title"],
                                        return_fields=["id"],
                                        return_scores=True,
                                        limit=5)
        #		ids = index.get_documents(top_docs, fields="id")

        # To decide if the most similar title in the index is a hit, we check whether its
        # score is significantly higher than those of the hits that follow it (second to fifth).
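        # For example, with scores [12.0, 3.0, 2.5, 2.0, 1.5] the mean of the remaining
        # scores is 2.25, and 12.0 > 2 * 2.25, so the top hit would be accepted.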

        if len(scores) > 2 and (scores[0] > 2 * np.mean(scores[1:])):
            doc_ids.append(top_docs[0][0])
            found += 1
        else:
            doc_ids.append("")

        # Only enable for debugging and finding a threshold
        if 0:
            print "-------"
            print "%s" % (title)
            print "-------"
            for i, (id, ) in enumerate(top_docs):
                title = db.select_one("title",
                                      table="papers",
                                      where="id='%s'" % id)
                print "%.2f\t%s" % (scores[i], title.encode("UTF-8"))

            if (scores[0] > 2 * np.mean(scores[1:])):
                print "Found!",
                op = '>'
            else:
                print "Not found!",
                op = '<'

            print "(%.2f %s %.2f)\n" % (scores[0], op, 2 * np.mean(scores[1:]))

    return doc_ids
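A minimal usage sketch, assuming an index has already been built; the titles and index path below are placeholders, not values from the original project:

titles = ["Latent Dirichlet Allocation", "The PageRank Citation Ranking"]
doc_ids = find_ids_unsupervised(titles, "data/index")
for title, doc_id in zip(titles, doc_ids):
    print "%s -> %s" % (title, doc_id if doc_id else "<not found>")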
Example #3
def get_texts(pub_ids, use_title=True, use_abs=True):
  '''
  This is a non-batch version. Much slower but more
  memory efficient.
  '''
  db = MyMySQL(db='csx', user='******', passwd='')

  fields = []
  if use_title: fields.append("title")
  if use_abs: fields.append("abstract")

  texts = []
  for pub_id in pub_ids:
    text_fields = db.select_one(fields=fields, table="papers", where="id='%s'" % pub_id)
    text = ''
    for tf in text_fields:
      if tf is not None:
        text += tf

    texts.append(text)

  return texts
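A short sketch of how get_texts might be called; the ids below are placeholders rather than real CiteSeerX document ids:

pub_ids = ['10.1.1.1.1111', '10.1.1.2.2222']
texts = get_texts(pub_ids, use_title=True, use_abs=False)
for pub_id, text in zip(pub_ids, texts):
    print "%s: %d characters" % (pub_id, len(text))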
Example #4
class Tokenizer():
    def __init__(self):

        # Zeno task manager
        self.tasks = zeno.TasksManager("tasks",
                                       host=config.DB_HOST,
                                       user=config.DB_USER,
                                       passwd=config.DB_PASSWD)

        # Database connection
        self.db = MyMySQL(db=config.DB_NAME,
                          host=config.DB_HOST,
                          user=config.DB_USER,
                          passwd=config.DB_PASSWD)

        # Logging configuration
        self.log = utils.config_logging(
            'tokenizer',
            stream=sys.stdout,
            level=logging.DEBUG,
            format='%(asctime)s (%(name)s) [%(levelname)6s]: %(message)s',
            datefmt="%Y-%m-%d %H:%M:%S")

        self.MIN_TOKENS = 10

        # Create the token folders if they do not exist yet
        utils.ensure_folder(os.path.dirname(config.TOKENS_PATH))
        utils.ensure_folder(os.path.dirname(config.TOKENS_PATH_PARTS))

    def save_tokens(self, tokens, tok_file):
        counter = Counter(tokens)
        with open(tok_file, 'w') as f:
            # 			print >> f, (' '.join(tokens)).encode("utf-8")
            lines = [
                "%s %d" % (token, count) for (token, count) in counter.items()
            ]
            print >> f, '\n'.join(lines).encode("UTF-8")

    def get_section(self, html_file, possible_section_names,
                    possible_next_sections):

        # Open and parse HTML, then extract all textual content from each paragraph
        h = html.parse(html_file)  #, parser=etree.XMLParser(encoding="utf-8"))
        pars = [
            paragraph.text_content().lower().encode("UTF-8")
            for paragraph in h.xpath("//p")
        ]  # .encode("utf-8")

        # First we go backwards trying to find the latest occurrence of
        # one of the possible names of the section of interest
        begin = None
        for i in reversed(xrange(len(pars))):
            if match_any(pars[i], possible_section_names):
                begin = i
                break

        # If the start wasn't found, just halt right away
        if (begin is None):
            return ""

        # Otherwise we can look for the end of the section starting from the start
        # of the found section.
        end = None
        for j in xrange(begin + 1, len(pars)):
            if match_any(pars[j], possible_next_sections):
                end = j
                break

        # End of section not found, so it's not safe to keep this content,
        # so we return an empty string.
        if (end is None):
            return ""

        # Otherwise join all paragraphs inside the section found
        return unicode("".join([fix_hyphens(p) for p in pars[begin:end]]),
                       "UTF-8")

    def get_title_and_abstract(self, paper_id):
        title, abstract = self.db.select_one(["title", "abstract"],
                                             table="papers",
                                             where="id='%s'" % paper_id)
        if title is None: title = ""
        if abstract is None: abstract = ""

        return title, abstract

    def process_full_text(self, paper_id):
        '''
        Tokenizes the full text of the given document and stores the tokens on disk.
        '''
        txt_file = config.TXT_PATH % paper_id
        tok_file = config.TOKENS_PATH % paper_id

        with open(txt_file, 'r') as f:
            text = unicode(f.read(), "utf-8")

        tokens = utils.tokenize(text)
        if (len(tokens) < self.MIN_TOKENS):
            raise MinimumTokensException(
                "Minimum number of tokens (%d) could not be extracted. "
                "Document is likely to be badly encoded." % self.MIN_TOKENS)

        self.save_tokens(tokens, tok_file)

    def process_important_parts(self, paper_id):
        '''
        Tokenizes specific parts of the document deemed important, namely
        the title, abstract and conclusion.
        '''
        html_file = config.HTML_PATH % paper_id
        tokens_file = config.TOKENS_PATH_PARTS % paper_id

        # Get title and abstract from DB
        title, abstract = self.get_title_and_abstract(paper_id)

        # Get conclusion from full text
        conclusion = self.get_section(
            html_file, ['conclusion', 'concluding', 'summary'],
            ['reference', 'bibliography', 'acknowledg', 'appendix'])

        # Uncomment if you don't want to use the abstract from the DB
        #		abstract = self.get_section(html_file, ['abstract'], ['categories', 'keywords', 'introduction'])

        # Tokenize each part and save into a file
        tokens = []
        tokens += utils.tokenize(title)
        tokens += utils.tokenize(abstract)
        tokens += utils.tokenize(conclusion)

        if (len(tokens) < self.MIN_TOKENS):
            raise MinimumTokensException(
                ("Minimum number of tokens (%d) could not be extracted. " %
                 self.MIN_TOKENS) +
                "Document is likely to have decoding problems.")

        self.save_tokens(tokens, tokens_file)

    def run(self):

        self.log.info("Starting process %d" % os.getpid())

        # Keep running until a stop file is found
        while (not os.path.exists("stop")):

            try:
                paper_id = self.tasks.get_next("CONVERTED")

                # Pre-process the full text and the important parts separately, into different folders
                self.process_full_text(paper_id)
                self.process_important_parts(paper_id)

                # Update the task status and the disk in which the file was saved.
                self.tasks.update_success(paper_id, "TOKENIZED")

                # Everything went OK if we got here
                self.log.info("%s: OK" % paper_id)

            # Nothing to collect
            except NothingToProcessException:
                self.log.info("Nothing to process.")
                break

            except MinimumTokensException, e:
                self.log.error("%s: FAIL\n%s\n" %
                               (paper_id, traceback.format_exc()))
                self.tasks.update_error(paper_id, message=str(e))

            # Any other exception we log the traceback and update the DB
            except Exception:
                self.log.error("%s: FAIL\n%s\n" %
                               (paper_id, traceback.format_exc()))
                self.tasks.update_error(paper_id, "TOKENIZE_ERROR")
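A hedged sketch of how this worker might be launched; the __main__ guard below is an assumption and is not part of the original module:

if __name__ == '__main__':
    tokenizer = Tokenizer()
    tokenizer.run()  # keeps processing tasks until a file named "stop" appears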
Example #5
class Downloader():
    def __init__(self):
        '''
		Stores the process id and creates a task manager to get 
		and update tasks.
		'''
        # Zeno task manager
        self.tasks = zeno.TasksManager("tasks",
                                       host=config.DB_HOST,
                                       user=config.DB_USER,
                                       passwd=config.DB_PASSWD)

        # Database connection
        self.db = MyMySQL(db=config.DB_NAME,
                          host=config.DB_HOST,
                          user=config.DB_USER,
                          passwd=config.DB_PASSWD)

        # Logging configuration
        self.log = utils.config_logging(
            'downloader',
            stream=sys.stdout,
            level=logging.DEBUG,
            format='%(asctime)s (%(name)s) [%(levelname)6s]: %(message)s',
            datefmt="%Y-%m-%d %H:%M:%S")

    def parse_error(self, content):
        '''
        Parses the returned response's HTML and raises the appropriate exception.
        '''
        # str.find returns -1 when the substring is absent (which is truthy), so the
        # result must be compared explicitly rather than used as a boolean.
        if content.find("Download Limit Exceeded") >= 0:
            raise LimitReachedException()
        else:
            raise Exception()

    def make_csx_url(self, id):
        return "http://citeseerx.ist.psu.edu/viewdoc/download?doi=%s&rep=rep1&type=pdf" % id

    def download_from_csx(self, paper_id):
        '''
        Downloads the paper's PDF directly from CiteSeerX.
        '''

        # Build the CiteSeerX download URL for this paper id
        url = "http://citeseerx.ist.psu.edu/viewdoc/download?doi=%s&rep=rep1&type=pdf" % paper_id

        headers = {'User-Agent': 'Chrome/34.0.1847.116 (X11; Linux x86_64)'}
        response = requests.get(url, headers=headers)

        if (response.status_code != 200):
            raise RequestException("%d: %s" %
                                   (response.status_code, response.reason))

        if response.headers['Content-Type'].startswith('text/html'):
            self.parse_error(response.content)

        # Save the PDF to the local disk (self.data_folder is assumed to be set
        # elsewhere, since it is not initialized in this snippet's __init__)
        file_path = os.path.join(self.data_folder, "%s.pdf" % paper_id)
        with open(file_path, "wb") as pdf_file:
            pdf_file.write(response.content)

    def get_all_urls(self, paper_id):
        ''' Returns all known download URLs for the paper (CiteSeerX and external). '''

        cluster_id = self.db.select_one("cluster",
                                        table="papers",
                                        where="id='%s'" % paper_id)

        alt_paper_ids = self.db.select("id",
                                       table="papers",
                                       where="cluster=%d" % cluster_id)

        urls = []
        for altern_id in alt_paper_ids:
            urls = urls + [self.make_csx_url(altern_id)]

            other_urls = self.db.select("url",
                                        table="urls",
                                        where="paperid='%s'" % altern_id)
            urls = other_urls + urls

        return urls

    def download(self, paper_id):
        '''
        Tries each known URL for the given paper and saves the first valid PDF found.
        '''
        headers = {'User-Agent': 'Chrome/34.0.1847.116 (X11; Linux x86_64)'}

        # Get all candidate URLs for this paper from the database
        urls = self.get_all_urls(paper_id)
        for url in urls:

            # Only supports PDF for now
            if url[-3:].lower() != "pdf":
                continue

            try:
                response = requests.get(url, headers=headers)
            except ConnectionError:
                self.log.warn("Connection error! Ignoring URL '%s'" % (url))
                continue

            response_type = response.headers['Content-Type']

            if response_type.startswith('text/html'):
                if response.content.find("Download Limit Exceeded") >= 0:
                    raise LimitReachedException()
                else:
                    continue

            if (response.status_code != 200) or (response_type != "application/pdf"):
                continue

            # raise MissingURLException()
            # if (response.status_code != 200):
            #     raise RequestException("%d: %s" % (response.status_code, response.reason))

            # Save the PDF to the local disk
            file_path = config.PDF_PATH % paper_id
            with open(file_path, "wb") as pdf_file:
                pdf_file.write(response.content)

            # Download successfully completed
            return True

        # If we got here, no valid URL was found
        return False

    def run(self):

        self.log.info("Starting %s." % os.getpid())

        # Keep running until a stop file is found
        while (not os.path.exists("stop")):

            try:
                paper_id = self.tasks.get_next("START")

                if not self.download(paper_id):
                    raise DownloadException("Could not download paper '%s'." %
                                            paper_id)

                # Update the task status and the disk in which the file was saved.
                self.tasks.update_success(paper_id, "DOWNLOADED")

                # Everything went OK if we got here
                self.log.info("%s: OK" % paper_id)

            # Nothing to collect
            except NothingToProcessException:
                self.log.error("Nothing to process.")
                break

            except LimitReachedException:
                self.log.error("Request limit reached!! Waiting...")
                self.tasks.update_release(
                    paper_id, "Request limit reached. Will try again later.")
                time.sleep(60 * 60)

            # URL missing in the DB or not returning the resource.
            except DownloadException, e:
                self.log.error("%s: FAIL" % (paper_id))
                self.tasks.update_error(paper_id, message=str(e))

            # Request errors
            # except RequestException, e:
            #     self.log("%s: %s\n%s" % (paper_id, e.msg, traceback.format_exc()), show=True)
            #     self.db.update_status(paper_id, DBManager.DOWNLOAD_ERROR)

            # Any other exception we log the traceback, update the DB and life goes on
            except Exception, e:
                self.log.error("%s: FAIL: %s" %
                               (paper_id, traceback.format_exc()))
                self.tasks.update_error(paper_id, message=str(e))
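As with the tokenizer, a hypothetical entry point (not shown in the original) would simply instantiate the class and call run():

if __name__ == '__main__':
    downloader = Downloader()
    downloader.run()  # keeps downloading papers until a "stop" file is created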