def list_articles(target_directory, supplementary_materials=False, skip=[]):
    listing = listdir(target_directory)
    for filename in listing:
        result_tree = ElementTree()
        result_tree.parse(path.join(target_directory, filename))
        for tree in result_tree.iterfind('article'):
            pmcid = _get_pmcid(tree)
            if pmcid in skip:
                continue

            result = {}
            result['name'] = pmcid
            result['doi'] = _get_article_doi(tree)
            result['article-categories'] = _get_article_categories(tree)
            result['article-contrib-authors'] = _get_article_contrib_authors(tree)
            result['article-title'] = _get_article_title(tree)
            result['article-abstract'] = _get_article_abstract(tree)
            result['journal-title'] = _get_journal_title(tree)
            result['article-year'], \
                result['article-month'], \
                result['article-day'] = _get_article_date(tree)
            result['article-url'] = _get_article_url(tree)
            result['article-license-url'], \
                result['article-license-text'], \
                result['article-copyright-statement'] = _get_article_licensing(tree)
            result['article-copyright-holder'] = _get_article_copyright_holder(tree)

            if supplementary_materials:
                result['supplementary-materials'] = _get_supplementary_materials(tree)
            yield result
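
A minimal usage sketch of the generator above, assuming the `_get_*` helpers and a directory of PMC XML files are available; the directory name and output format here are illustrative only:

# Hypothetical consumption of the generator defined above.
for article in list_articles('pmc_xml/', supplementary_materials=True):
    print('%s (%s): %s' % (article['name'],
                           article['article-year'],
                           article['article-title']))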
Example #3
def _py_gnc_import_get_split_online_id(session, split):
    # Cache the "online_id" slot of every split in the session's file,
    # keyed by split GUID, so the XML is parsed only once per session.
    if not hasattr(_py_gnc_import_get_split_online_id, "cache"):
        _py_gnc_import_get_split_online_id.cache = {}
    if session not in _py_gnc_import_get_split_online_id.cache:
        cache = {}
        with open(session.get_file_path(), 'rb') as f:
            # GnuCash data files may be gzip-compressed; check the magic bytes.
            if f.read(2) == b'\x1f\x8b':
                f.seek(0)
                f = gzip.GzipFile(fileobj=f)
            else:
                f.seek(0)
            xml = ElementTree(file=f)
        ns = {"slot": "http://www.gnucash.org/XML/slot",
              "split": "http://www.gnucash.org/XML/split",
              "trn": "http://www.gnucash.org/XML/trn"}
        for el_split in xml.iterfind('.//trn:split', ns):
            for el_slot in el_split.iterfind('split:slots/slot', ns):
                el_key = el_slot.find('slot:key', ns)
                if el_key is not None and el_key.text == "online_id":
                    el_guid = el_split.find("split:id[@type='guid']", ns)
                    el_val = el_slot.find('slot:value', ns)
                    if el_guid is not None and el_val is not None:
                        cache[el_guid.text] = el_val.text
        _py_gnc_import_get_split_online_id.cache[session] = cache
    cache = _py_gnc_import_get_split_online_id.cache[session]
    guid = split.GetGUID().to_string()
    return cache.get(guid)
def get_major_category_from_pmid(pmid):
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=%s&retmode=xml' % pmid
    xml_file = _get_file_from_url(url)
    tree = ElementTree()
    tree.parse(xml_file)
    for e in tree.iterfind('PubmedArticle/MedlineCitation/MeshHeadingList/MeshHeading/DescriptorName'):
        if e.attrib['MajorTopicYN'] == 'Y':
            return _postprocess_category(e.text)
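
The iterfind path mirrors the layout of the efetch XML. A self-contained sketch against a hand-written fragment (the headings below are made up for illustration) isolates the MajorTopicYN filter:

from xml.etree.ElementTree import fromstring

# Hypothetical efetch fragment containing only the elements the path touches.
root = fromstring(
    '<PubmedArticleSet><PubmedArticle><MedlineCitation><MeshHeadingList>'
    '<MeshHeading><DescriptorName MajorTopicYN="N">Humans</DescriptorName></MeshHeading>'
    '<MeshHeading><DescriptorName MajorTopicYN="Y">Neoplasms</DescriptorName></MeshHeading>'
    '</MeshHeadingList></MedlineCitation></PubmedArticle></PubmedArticleSet>')

for e in root.iterfind('PubmedArticle/MedlineCitation/MeshHeadingList/MeshHeading/DescriptorName'):
    if e.attrib['MajorTopicYN'] == 'Y':
        print(e.text)  # prints: Neoplasms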
Example #5
def _get_pmcids_from_dois(dois):
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pmc&term=%s' % \
        '%20OR%20'.join([doi+'[doi]' for doi in dois])
    xml_file = _get_file_from_url(url)
    tree = ElementTree()
    tree.parse(xml_file)
    pmcids = []
    for e in tree.iterfind('IdList/Id'):
        pmcids.append(e.text)
    return pmcids
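
For reference, the search term assembled above joins each DOI with a URL-encoded ' OR '; with two made-up DOIs it would look like this:

dois = ['10.1000/xyz123', '10.1000/abc789']  # hypothetical DOIs
term = '%20OR%20'.join([doi + '[doi]' for doi in dois])
print(term)  # 10.1000/xyz123[doi]%20OR%2010.1000/abc789[doi]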
def get_categories_from_pmid(pmid):
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=%s&retmode=xml' % pmid
    xml_file = _get_file_from_url(url)
    tree = ElementTree()
    tree.parse(xml_file)
    categories = []
    for e in tree.iterfind('PubmedArticle/MedlineCitation/MeshHeadingList/MeshHeading/DescriptorName'):
        category = _postprocess_category(e.text)
        categories.append(category)
    return categories
Example #8
def get_categories_from_pmid(pmid):
    """
    Gets MeSH headings, returns those not deemed too broad.
    """
    if not isinstance(pmid, int):
        raise TypeError("Cannot get Categories for PMID %s of type %s." % (pmid, type(pmid)))
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=%s&retmode=xml' % pmid
    xml_file = _get_file_from_url(url)
    tree = ElementTree()
    tree.parse(xml_file)
    categories = []
    for heading in tree.iterfind('PubmedArticle/MedlineCitation/MeshHeadingList/MeshHeading'):
        htree = ElementTree(heading)
        descriptor_text = htree.find('DescriptorName').text
        if (htree.find('QualifierName') is not None) or \
            (' ' in descriptor_text and not 'and' in descriptor_text):
            categories.append(descriptor_text)
    return categories
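
The broadness filter can be read in isolation: a heading passes if it carries a QualifierName, or if its descriptor is multi-word and does not contain 'and'. A small sketch with a hypothetical helper name mirrors the predicate:

def _is_broad(descriptor_text, has_qualifier=False):
    # Mirrors the condition above: qualified headings and multi-word
    # descriptors that are not "X and Y"-style groupings are kept.
    return not (has_qualifier or
                (' ' in descriptor_text and 'and' not in descriptor_text))

print(_is_broad('Humans'))            # True  -> dropped as too broad
print(_is_broad('Breast Neoplasms'))  # False -> kept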
Example #9
    def __init__(self):
        self._faces = OrderedDict()
        tree = ElementTree(file=path.face_path + "face.xml")

        category = ""
        for face in tree.iterfind("./FACEINFO/"):
            assert face.tag == "FACE"
            face = FaceItem(face)

            if category != face.category:
                category = face.category
                self._faces[category] = OrderedDict()
            else:
                self._faces[category][face.name] = face

        size = tree.find("./WNDCONFIG/Align")
        self._col, self._row = int(size.get("Col")), int(size.get("Row"))

        self._all_faces_cache = []
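
The find/iterfind paths above imply a face.xml shaped roughly like the fragment below; this is a reconstruction from the code, not the actual file, and the root name and attribute values are placeholders:

from xml.etree.ElementTree import fromstring

# Hypothetical face.xml skeleton inferred from the paths used above.
sample = fromstring(
    '<FACECONFIG>'
    '<FACEINFO>'
    '<FACE/>'                 # whatever attributes/children FaceItem expects
    '</FACEINFO>'
    '<WNDCONFIG><Align Col="8" Row="4"/></WNDCONFIG>'
    '</FACECONFIG>')

align = sample.find('./WNDCONFIG/Align')
print(int(align.get('Col')), int(align.get('Row')))  # 8 4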
Example #11
from sys import argv

from git import Repo  # GitPython
from xml.etree.cElementTree import ElementTree

from model import Post

filename = argv[1]
repository_path = './posts'
repository = Repo.init(repository_path)
repository.index.commit('Import from Wordpress')  # Commit is unnecessary.

tree = ElementTree(file=filename)

description = tree.find('.//channel/description').text
repository.description = description

author_emails = {}
for author in tree.iterfind('.//{http://wordpress.org/export/1.2/}author'):
    author_name = author.find('{http://wordpress.org/export/1.2/}author_display_name').text
    author_email = author.find('{http://wordpress.org/export/1.2/}author_email').text
    author_emails[author_name] = author_email

for item in tree.iterfind('.//item'):
    post_type = item.find('{http://wordpress.org/export/1.2/}post_type').text
    status = item.find('{http://wordpress.org/export/1.2/}status').text
    if post_type == 'post' and status == 'publish':
        title = item.find('title').text
        pubdate = item.find('pubDate').text
        creator = item.find('{http://purl.org/dc/elements/1.1/}creator').text
        email = author_emails[creator]
        content = item.find('{http://purl.org/rss/1.0/modules/content/}encoded').text
        html = "<h1>%s</h1>\n%s" % (title, content)
        #comments = [c for c in item.iterfind('{http://wordpress.org/export/1.2/}comment')]