Exemplo n.º 1
0
    def __init__(self, root, **kwargs):
        """
        Initialize a PLoS reader with a specific corpus.

        Corpus information is read from the 'root/corpus_info.json' file.

        @type  root: string
        @param root: The directory path to the corpus directory.
        @param kwargs: 'doc_part' (default 'body') and 'fileids' are
            consumed here; a 'cat_map' entry is always (re)built from the
            corpus info. Everything else is forwarded unchanged to
            CategorizedPlaintextCorpusReader.__init__.
        """
        self._root = root
        # The context manager closes the file even when json.load raises.
        with open('%s/corpus_info.json' % (root), 'r') as fp:
            self._corpus_info = info = json.load(fp)

        # doc_part is specific to PLoS and research articles in general.
        # 'abstract' and 'body' are currently supported.
        # The corpus contains separate text for each, but the
        # reader is initialized to read only one.
        self._doc_part = doc_part = kwargs.pop('doc_part', 'body')

        d2c = info['d2c']

        # 'fileids' must be removed from kwargs: it is handed to the parent
        # constructor positionally below, and leaving it in kwargs as well
        # would make that call fail with "multiple values for argument".
        if 'fileids' in kwargs:
            fileids = kwargs.pop('fileids')
        else:
            fileids = [doi2fn(d, doc_part) for d in d2c]

        # cat_map f -> [ c1, c2, ...]
        # The fileids depend on what the doc_part is ('body', 'abstract').
        cat_map = {}
        for d, cat in d2c.items():
            cat_map[doi2fn(d, doc_part)] = cat
        kwargs['cat_map'] = cat_map

        # Subclass of Categorized Plaintext Corpus Reader
        CategorizedPlaintextCorpusReader.__init__(self, root, fileids, **kwargs)
Exemplo n.º 2
0
 def _write_doc(self, base_dir, doc, doi):
   """
   Store one document on disk as two UTF-8 text files under base_dir.

   The file names come from doi2fn(doi, 'body') and
   doi2fn(doi, 'abstract'). The first file receives doc['body'] and the
   second receives doc['abstract'][0].
   """
   path_template = '{d}/{f}'
   body_path = path_template.format(d=base_dir, f=doi2fn(doi, 'body'))
   abstract_path = path_template.format(d=base_dir, f=doi2fn(doi, 'abstract'))

   with codecs.open(body_path, 'w', encoding='utf-8') as out:
     out.write(doc['body'])

   with codecs.open(abstract_path, 'w', encoding='utf-8') as out:
     out.write(doc['abstract'][0])
Exemplo n.º 3
0
 def _article_info(self, doc, doi):
   """
   Build the info dictionary for one article.

   Starts from field_list_to_dict(doc, <field names>) and then adds the
   page URL, the XML URL and the body/abstract file ids derived from doi.
   Returns the resulting dictionary.
   """
   wanted = [
     'title',
     'author',
     'editor',
     'publication_date',
     'article_type',
     'journal',
     'id',
   ]
   info = field_list_to_dict(doc, wanted)

   # Links derived from the DOI.
   info['page_url'] = article_page_url(doi, pretty=True)
   info['xml_url'] = article_xml_url(doi, pretty=True)

   # File ids of the two document parts.
   info['body_fid'] = doi2fn(doi, 'body')
   info['abstract_fid'] = doi2fn(doi, 'abstract')
   return info
Exemplo n.º 4
0
 def _write_doc(self, base_dir, doc, doi):
     """
     Write the body and the abstract of doc to separate files in base_dir.

     Both files are opened for writing with UTF-8 encoding. The body file
     gets doc['body']; the abstract file gets doc['abstract'][0].
     """
     def target(part):
         # Path of the file that holds the given part of this DOI.
         return '{d}/{f}'.format(d=base_dir, f=doi2fn(doi, part))

     # Both paths are resolved before any file is opened.
     body_path, abstract_path = target('body'), target('abstract')

     with codecs.open(body_path, 'w', encoding='utf-8') as sink:
         sink.write(doc['body'])
     with codecs.open(abstract_path, 'w', encoding='utf-8') as sink:
         sink.write(doc['abstract'][0])
Exemplo n.º 5
0
 def _article_info(self, doc, doi):
     """
     Collect the metadata stored for a single article.

     The selected fields are taken from doc via field_list_to_dict. The
     page/XML URLs and the body/abstract file ids are then added, all
     computed from doi. Returns the combined dictionary.
     """
     record = field_list_to_dict(
         doc,
         ['title', 'author', 'editor', 'publication_date', 'article_type',
          'journal', 'id'])

     record['page_url'] = article_page_url(doi, pretty=True)
     record['xml_url'] = article_xml_url(doi, pretty=True)

     # One file id per supported document part.
     for key, part in (('body_fid', 'body'), ('abstract_fid', 'abstract')):
         record[key] = doi2fn(doi, part)
     return record
Exemplo n.º 6
0
  def __init__(self, root, **kwargs):
    """
    Initialize a PLoS reader with a specific corpus.

    Corpus information is read from the
    'root/<corpus_type>_corpus_info.json' file.

    @type  root: string
    @param root: The directory path to the corpus.
    @param kwargs: 'corpus_type' (default 'full'), 'doc_part' (default
        'body') and 'fileids' are consumed here; a 'cat_map' entry is
        always (re)built from the corpus info. Everything else is
        forwarded unchanged to CategorizedPlaintextCorpusReader.__init__.
    """
    self._root = root

    # corpus type is specific to Plos_builder
    # full - all documents that were built.
    # partial - documents excluding training
    # training - documents intended for training
    self._corpus_type = kwargs.pop('corpus_type', 'full')

    fn = '{d}/{t}_corpus_info.json'.format(d=root, t=self._corpus_type)
    with open(fn, 'r') as fp:
      self._corpus_info = info = json.load(fp)

    # doc_part is specific to PLoS and research articles.
    # 'abstract' and 'body' are currently supported.
    # The corpus contains separate text for each, but the
    # reader is initialized to read only one.
    self._doc_part = doc_part = kwargs.pop('doc_part', 'body')

    # 'fileids' must be removed from kwargs: it is handed to the parent
    # constructor positionally below, and leaving it in kwargs as well
    # would make that call fail with "multiple values for argument".
    if 'fileids' in kwargs:
      fileids = kwargs.pop('fileids')
    else:
      fileids = [doi2fn(d, doc_part) for d in self.dois()]

    # cat_map f -> [ c1, c2, ...]
    # The fileids depend on what the doc_part is ('body', 'abstract').
    kwargs['cat_map'] = {
      doi2fn(d, doc_part): cat
      for d, cat in info['dois_to_categories'].items()
    }

    # Subclass of Categorized Plaintext Corpus Reader
    CategorizedPlaintextCorpusReader.__init__(self, root, fileids, **kwargs)
Exemplo n.º 7
0
 def doi2fid(self, doi_lst=None):
     """
     Pair DOIs with what doi2fn returns for them and this reader's doc part.

     @param doi_lst: Passed to listafy together with
         self._corpus_info['d2c']. Presumably None selects every DOI in
         the corpus -- TODO confirm against listafy.
     @return: zip(dois, doi2fn(dois, self._doc_part)), where dois is the
         result of listafy. NOTE(review): doi2fn is called with the whole
         collection here, so it is assumed to accept one -- confirm.
     """
     selected = listafy(doi_lst, self._corpus_info['d2c'])
     file_ids = doi2fn(selected, self._doc_part)
     return zip(selected, file_ids)
Exemplo n.º 8
0
    def add(self, docs):
        """
        Add documents to the corpus.

        For each doc, the 'body' text and the first 'abstract' entry are
        written to their own UTF-8 files under the corpus root (file names
        come from doi2fn), and the in-memory corpus info mappings
        ('article_link', 'xml_link', 'd2c', 'c2d', 'd2info') are updated.

        @type docs: list
        @param docs: A list containing the results of a PLoS search query.
                     Each item is a dictionary with QUERY_RTN_FLDS as keys.
                     'id', 'body' and 'abstract' are required; a missing
                     'subject' is set to [] on the doc itself.

        @return: Nothing
        """
        root = self._root
        info = self._corpus_info
        d2cmap = {}
        c2dmap = {}
        d2infomap = {}
        amap = info['article_link']
        xmap = info['xml_link']

        # Build all the lists and mappings
        for doc in docs:
            doi = doc['id']
            # If the doc has no subject, add [] (stored on the doc as well).
            subjs = doc.setdefault('subject', [])
            # doi -> [ c1, c2, .... ]
            d2cmap[doi] = subjs
            # Category -> [ doi1, doi2, .... ]
            for s in subjs:
                c2dmap.setdefault(s, []).append(doi)
            # doi -> article link
            amap[doi] = articleUrl(doi)
            # doi -> article xml link
            xmap[doi] = articleXML(doi)
            # Depending on the article type some of these might not exist.
            d2infomap[doi] = (
                doc.get('journal', ''),
                doc.get('publication_date', ''),
                doc.get('article_type', ''),
                doc.get('title', ''),
                doc.get('author', []),
            )

        # Dump 'body' and 'abstract' into individual files. The file name
        # is derived from each doc's own DOI: pairing docs with
        # d2cmap.keys() is unsafe because dict key order need not match the
        # order of docs (and the lengths differ when a DOI repeats).
        for doc in docs:
            doi = doc['id']
            body_path = '%s/%s' % (root, doi2fn(doi, 'body'))
            with codecs.open(body_path, 'w', encoding='utf-8') as fd:
                fd.write(doc['body'])
            abstract_path = '%s/%s' % (root, doi2fn(doi, 'abstract'))
            with codecs.open(abstract_path, 'w', encoding='utf-8') as fd:
                fd.write(doc['abstract'][0])

        # Update the corpus info
        info['d2c'].update(d2cmap)

        c2d = info['c2d']
        for k, v in c2dmap.items():
            c2d.setdefault(k, []).extend(v)

        info['d2info'].update(d2infomap)