def list_articles(target_directory, supplementary_materials=False, skip=None):
    """Parse every XML file in *target_directory* and yield one metadata
    dict per ``<article>`` element found.

    Parameters:
        target_directory: directory whose files are parsed as article XML.
        supplementary_materials: when True, also extract the article's
            supplementary materials into the result.
        skip: optional collection of PMCIDs to ignore (default: none).

    Yields:
        dict keyed by 'name', 'doi', 'article-categories', etc.
    """
    if skip is None:
        # Use a None sentinel: the original mutable default (skip=[]) would
        # be shared across all calls to this function.
        skip = ()
    for filename in listdir(target_directory):
        result_tree = ElementTree()
        result_tree.parse(path.join(target_directory, filename))
        for tree in result_tree.iterfind('article'):
            pmcid = _get_pmcid(tree)
            if pmcid in skip:
                continue
            result = {}
            result['name'] = pmcid
            result['doi'] = _get_article_doi(tree)
            result['article-categories'] = _get_article_categories(tree)
            result['article-contrib-authors'] = \
                _get_article_contrib_authors(tree)
            result['article-title'] = _get_article_title(tree)
            result['article-abstract'] = _get_article_abstract(tree)
            result['journal-title'] = _get_journal_title(tree)
            result['article-year'], result['article-month'], \
                result['article-day'] = _get_article_date(tree)
            result['article-url'] = _get_article_url(tree)
            result['article-license-url'], result['article-license-text'], \
                result['article-copyright-statement'] = \
                _get_article_licensing(tree)
            result['article-copyright-holder'] = \
                _get_article_copyright_holder(tree)
            if supplementary_materials:
                result['supplementary-materials'] = \
                    _get_supplementary_materials(tree)
            yield result
def list_articles(target_directory, supplementary_materials=False, skip=None):
    """Yield a metadata dict for each ``<article>`` element found in the
    XML files of *target_directory*.

    Parameters:
        target_directory: directory whose files are parsed as article XML.
        supplementary_materials: when True, include the article's
            supplementary materials in the result.
        skip: optional collection of PMCIDs to ignore (default: none).

    Yields:
        dict with keys 'name', 'doi', 'article-categories', ...
    """
    # None sentinel instead of a mutable [] default, which would be shared
    # between calls and could be mutated accidentally.
    skip = () if skip is None else skip
    for filename in listdir(target_directory):
        result_tree = ElementTree()
        result_tree.parse(path.join(target_directory, filename))
        for tree in result_tree.iterfind('article'):
            pmcid = _get_pmcid(tree)
            if pmcid in skip:
                continue
            result = {
                'name': pmcid,
                'doi': _get_article_doi(tree),
                'article-categories': _get_article_categories(tree),
                'article-contrib-authors':
                    _get_article_contrib_authors(tree),
                'article-title': _get_article_title(tree),
                'article-abstract': _get_article_abstract(tree),
                'journal-title': _get_journal_title(tree),
                'article-url': _get_article_url(tree),
            }
            result['article-year'], result['article-month'], \
                result['article-day'] = _get_article_date(tree)
            result['article-license-url'], result['article-license-text'], \
                result['article-copyright-statement'] = \
                _get_article_licensing(tree)
            result['article-copyright-holder'] = \
                _get_article_copyright_holder(tree)
            if supplementary_materials:
                result['supplementary-materials'] = \
                    _get_supplementary_materials(tree)
            yield result
def _py_gnc_import_get_split_online_id(session, split):
    """Return the "online_id" slot value stored for *split*, or None.

    The GnuCash XML file belonging to *session* is parsed once and the
    guid -> online_id mapping is memoised on the function object, keyed
    by session.

    Parameters:
        session: GnuCash session providing ``get_file_path()``.
        split: GnuCash split providing ``GetGUID()``.
    """
    func = _py_gnc_import_get_split_online_id
    if not hasattr(func, "cache"):
        func.cache = {}
    if session not in func.cache:
        cache = {}
        raw = open(session.get_file_path(), 'rb')
        try:
            f = raw
            # GnuCash data files may be gzip-compressed; sniff the magic
            # number.  Must be a bytes literal: the file is opened in 'rb'
            # mode, so a str literal would never match under Python 3.
            if raw.read(2) == b'\x1f\x8b':
                raw.seek(0)
                f = gzip.GzipFile(fileobj=raw)
            else:
                raw.seek(0)
            xml = ElementTree(file=f)
        finally:
            # Close both the wrapper and the underlying handle; the
            # original leaked the raw handle when a GzipFile was in use.
            if f is not raw:
                f.close()
            raw.close()
        ns = {"slot": "http://www.gnucash.org/XML/slot",
              "split": "http://www.gnucash.org/XML/split",
              "trn": "http://www.gnucash.org/XML/trn"}
        for el_split in xml.iterfind('.//trn:split', ns):
            for el_slot in el_split.iterfind('split:slots/slot', ns):
                el_key = el_slot.find('slot:key', ns)
                if el_key is not None and el_key.text == "online_id":
                    el_guid = el_split.find("split:id[@type='guid']", ns)
                    el_val = el_slot.find('slot:value', ns)
                    if el_guid is not None and el_val is not None:
                        cache[el_guid.text] = el_val.text
        func.cache[session] = cache
    # .get() returns None for unknown guids, matching the original's
    # fall-through "return None".
    return func.cache[session].get(split.GetGUID().to_string())
def get_major_category_from_pmid(pmid):
    """Fetch the PubMed record for *pmid* and return its first major MeSH
    descriptor, post-processed, or None when no major topic exists.

    Parameters:
        pmid: PubMed identifier interpolated into the efetch URL.
    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=%s&retmode=xml' % pmid
    xml_file = _get_file_from_url(url)
    tree = ElementTree()
    tree.parse(xml_file)
    for e in tree.iterfind('PubmedArticle/MedlineCitation/MeshHeadingList/MeshHeading/DescriptorName'):
        # .get() tolerates DescriptorName elements without a MajorTopicYN
        # attribute; the original attrib[...] lookup raised KeyError there.
        if e.get('MajorTopicYN') == 'Y':
            return _postprocess_category(e.text)
    # Explicit: no major topic found.
    return None
def _get_pmcids_from_dois(dois):
    """Resolve *dois* to PMC IDs via a single NCBI esearch query.

    Parameters:
        dois: iterable of DOI strings, OR-combined into one query.

    Returns:
        list of PMC ID strings.
    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pmc&term=%s' % \
        '%20OR%20'.join([doi + '[doi]' for doi in dois])
    xml_file = _get_file_from_url(url)
    tree = ElementTree()
    tree.parse(xml_file)
    # Comprehension replaces the manual append loop of the original.
    return [e.text for e in tree.iterfind('IdList/Id')]
def get_categories_from_pmid(pmid):
    """Return all post-processed MeSH descriptor names for *pmid*.

    Parameters:
        pmid: PubMed identifier interpolated into the efetch URL.

    Returns:
        list of post-processed category strings.
    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=%s&retmode=xml' % pmid
    tree = ElementTree()
    tree.parse(_get_file_from_url(url))
    descriptor_path = 'PubmedArticle/MedlineCitation/MeshHeadingList/MeshHeading/DescriptorName'
    return [_postprocess_category(node.text)
            for node in tree.iterfind(descriptor_path)]
def _get_pmcids_from_dois(dois):
    """Look up the PMC IDs for the given DOIs with one NCBI esearch call.

    Parameters:
        dois: iterable of DOI strings; combined with OR into one term.

    Returns:
        list of PMC ID strings.
    """
    query_terms = [doi + '[doi]' for doi in dois]
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pmc&term=%s' % \
        '%20OR%20'.join(query_terms)
    xml_file = _get_file_from_url(url)
    tree = ElementTree()
    tree.parse(xml_file)
    pmcids = []
    for id_element in tree.iterfind('IdList/Id'):
        pmcids.append(id_element.text)
    return pmcids
def get_categories_from_pmid(pmid):
    """Get MeSH headings for *pmid*, returning those not deemed too broad.

    A heading is kept when it carries a QualifierName, or when its
    descriptor is multi-word and does not contain 'and'.

    Parameters:
        pmid: PubMed identifier; must be an int.

    Returns:
        list of descriptor strings.

    Raises:
        TypeError: if *pmid* is not an int.
    """
    if not isinstance(pmid, int):
        # isinstance replaces type(...) == int; call-form raise replaces the
        # Python-2-only "raise TypeError, msg" statement.
        raise TypeError(
            "Cannot get Categories for PMID %s of type %s." % (pmid, type(pmid)))
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=%s&retmode=xml' % pmid
    xml_file = _get_file_from_url(url)
    tree = ElementTree()
    tree.parse(xml_file)
    categories = []
    for heading in tree.iterfind('PubmedArticle/MedlineCitation/MeshHeadingList/MeshHeading'):
        # Find directly on the element; wrapping it in an ElementTree (as
        # the original did) is an unnecessary indirection.
        descriptor_text = heading.find('DescriptorName').text
        if (heading.find('QualifierName') is not None) or \
                (' ' in descriptor_text and 'and' not in descriptor_text):
            categories.append(descriptor_text)
    return categories
def __init__(self):
    """Load face definitions from face.xml and the selector grid size."""
    self._faces = OrderedDict()
    tree = ElementTree(file=path.face_path + "face.xml")
    current_category = ""
    for node in tree.iterfind("./FACEINFO/"):
        assert node.tag == "FACE"
        item = FaceItem(node)
        if current_category != item.category:
            # A new category opens a fresh bucket.  NOTE(review): the item
            # that opens the category is itself never stored (storage only
            # happens in the else branch) -- confirm this is intentional.
            current_category = item.category
            self._faces[current_category] = OrderedDict()
        else:
            self._faces[current_category][item.name] = item
    align = tree.find("./WNDCONFIG/Align")
    self._col = int(align.get("Col"))
    self._row = int(align.get("Row"))
    # Lazily-populated elsewhere; starts empty.
    self._all_faces_cache = []
def __init__(self):
    """Read face.xml: face items grouped by category, plus grid layout."""
    # Mapping: category name -> OrderedDict of face name -> FaceItem.
    self._faces = OrderedDict()
    # path.face_path is a project-provided asset directory string.
    tree = ElementTree(file=path.face_path + "face.xml")
    category = ""
    for face in tree.iterfind("./FACEINFO/"):
        assert face.tag == "FACE"
        face = FaceItem(face)
        if category != face.category:
            # First FACE of a new category creates its bucket.
            # NOTE(review): that first FACE is never itself added (adding
            # happens only in the else branch) -- confirm this is the
            # intended face.xml layout and not a dropped item.
            category = face.category
            self._faces[category] = OrderedDict()
        else:
            self._faces[category][face.name] = face
    # Grid dimensions of the face-selection window.
    size = tree.find("./WNDCONFIG/Align")
    self._col, self._row = int(size.get("Col")), int(size.get("Row"))
    # Flat cache of all faces; starts empty, presumably filled lazily.
    self._all_faces_cache = []
# Import published WordPress posts (WXR 1.2 export file) into a git repo.
# NOTE(review): `argv` and `Repo` are used but not imported in this chunk --
# presumably sys.argv and git.Repo from an earlier part of the file.
from xml.etree.cElementTree import ElementTree
from model import Post

# Path to the WordPress export XML, given on the command line.
filename = argv[1]
repository_path = './posts'
repository = Repo.init(repository_path)
# NOTE(review): original comment says "Commit is unnecessary." -- consider
# dropping this initial empty commit.
repository.index.commit('Import from Wordpress')
tree = ElementTree(file=filename)
# Blog description becomes the git repository description.
description = tree.find('.//channel/description').text
repository.description = description
# Map author display name -> email, for attributing commits/posts.
author_emails = {}
for author in tree.iterfind('.//{http://wordpress.org/export/1.2/}author'):
    author_name = author.find('{http://wordpress.org/export/1.2/}author_display_name').text
    author_email = author.find('{http://wordpress.org/export/1.2/}author_email').text
    author_emails[author_name] = author_email
# Walk every <item>; only published posts are imported.
for item in tree.iterfind('.//item'):
    post_type = item.find('{http://wordpress.org/export/1.2/}post_type').text
    status = item.find('{http://wordpress.org/export/1.2/}status').text
    if post_type == 'post' and status == 'publish':
        title = item.find('title').text
        pubdate = item.find('pubDate').text
        creator = item.find('{http://purl.org/dc/elements/1.1/}creator').text
        email = author_emails[creator]
        content = item.find('{http://purl.org/rss/1.0/modules/content/}encoded').text
        # Render the post as a simple HTML fragment: title heading + body.
        html = "<h1>%s</h1>\n%s" % (title, content)
        # TODO: comment import was started but never finished:
        #comments = [c for c in item.iterfind('{http://wordpress.org/export/1.2/}comment')]