    def extract_references(self, req, form):
        """Refrences extraction page

        This page can be used for authors to test their pdfs against our
        refrences extraction process"""
        user_info = collect_user_info(req)

        # Handle the 4 POST parameters (pdf, arxiv, url, txt)
        if 'pdf' in form and form['pdf'].value:
            pdf = form['pdf'].value
            references_xml = extract_from_pdf_string(pdf)
        elif 'arxiv' in form and form['arxiv'].value:
            url = make_arxiv_url(arxiv_id=form['arxiv'].value)
            references_xml = extract_references_from_url_xml(url)
        elif 'url' in form and form['url'].value:
            url = form['url'].value
            references_xml = extract_references_from_url_xml(url)
        elif 'txt' in form and form['txt'].value:
            txt = form['txt'].value
            references_xml = extract_references_from_string_xml(txt)
        else:
            references_xml = None

        # If we have not uploaded anything yet
        # Display the form that allows us to do so
        if not references_xml:
            out = self.extract_references_template()
        else:
            out = """
            <style type="text/css">
                #referenceinp_link { display: none; }
            </style>
            """
            out += format_record(0,
                                 'hdref',
                                 xml_record=references_xml.encode('utf-8'),
                                 user_info=user_info)

        # Render the page (including header, footer)
        return page(title='References Extractor',
                    body=out,
                    uid=user_info['uid'],
                    req=req)
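The handler above checks the four form fields in priority order (pdf, then arxiv, url and txt) and routes each one to the matching extraction helper. Below is a minimal sketch of the same dispatch written as a table, assuming the same helper functions are in scope; _EXTRACTORS and run_extraction are hypothetical names introduced only for illustration.

# Hypothetical dispatch table mirroring the priority order of the handler above.
# extract_from_pdf_string, make_arxiv_url, extract_references_from_url_xml and
# extract_references_from_string_xml are assumed to be the same helpers used there.
_EXTRACTORS = (
    ('pdf', lambda value: extract_from_pdf_string(value)),
    ('arxiv', lambda value: extract_references_from_url_xml(
        make_arxiv_url(arxiv_id=value))),
    ('url', lambda value: extract_references_from_url_xml(value)),
    ('txt', lambda value: extract_references_from_string_xml(value)),
)


def run_extraction(form):
    """Return the references XML for the first non-empty form field, or None."""
    for field, extractor in _EXTRACTORS:
        if field in form and form[field].value:
            return extractor(form[field].value)
    return None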
    def extract(self, req, form):
        """Refrences extraction page

        This page can be used for authors to test their pdfs against our
        refrences extraction process"""
        user_info = collect_user_info(req)

        # Handle the 4 POST parameters (pdf, arxiv, url, txt)
        if 'pdf' in form and form['pdf'].value:
            pdf = form['pdf'].value.strip()
            references_xml = extract_from_pdf_string(pdf)
        elif 'arxiv' in form and form['arxiv'].value:
            url = make_arxiv_url(arxiv_id=form['arxiv'].value.strip())
            references_xml = extract_references_from_url_xml(url)
        elif 'url' in form and form['url'].value:
            url = form['url'].value.strip()
            try:
                references_xml = extract_references_from_url_xml(url)
            except (FullTextNotAvailable, ConnectionError, HTTPError, Timeout):
                references_xml = None
        elif 'txt' in form and form['txt'].value:
            txt = form['txt'].value.decode('utf-8', 'ignore')
            references_xml = extract_references_from_string_xml(txt)
        else:
            references_xml = None

        # If we have not uploaded anything yet
        # Display the form that allows us to do so
        if not references_xml:
            out = docextract_templates.tmpl_web_form()
        else:
            references_html = format_record(0,
                                            'hdref',
                                            xml_record=references_xml,
                                            user_info=user_info)
            out = docextract_templates.tmpl_web_result(references_html)

        # Render the page (including header, footer)
        return page(title='References Extractor',
                    body=out,
                    uid=user_info['uid'],
                    req=req)
    def extract_references_pdf_url(self, req, form):
        """Extract references from the pdf pointed by the passed url"""
        check_login(req)

        if 'url' not in form:
            return 'No URL specified'

        url = form['url'].value

        if not check_url(url):
            return 'Invalid URL specified'

        return extract_references_from_url_xml(url)
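A minimal client-side sketch of calling this handler, assuming the requests library and a purely hypothetical endpoint path; the real URL depends on how the web interface mounts extract_references_pdf_url, and a logged-in session is required because of check_login.

import requests

# Hypothetical endpoint path, for illustration only.
ENDPOINT = 'https://example.org/textmining/extract_references_pdf_url'


def fetch_references_xml(pdf_url, session=None):
    """POST a PDF URL to the extractor and return the references XML (sketch only)."""
    session = session or requests.Session()
    response = session.post(ENDPOINT, data={'url': pdf_url}, timeout=60)
    response.raise_for_status()
    return response.text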
    def extract(self, req, form):
        """Refrences extraction page

        This page can be used for authors to test their pdfs against our
        refrences extraction process"""
        user_info = collect_user_info(req)

        # Handle the 4 POST parameters (pdf, arxiv, url, txt)
        if 'pdf' in form and form['pdf'].value:
            pdf = form['pdf'].value
            references_xml = extract_from_pdf_string(pdf)
        elif 'arxiv' in form and form['arxiv'].value:
            url = make_arxiv_url(arxiv_id=form['arxiv'].value)
            references_xml = extract_references_from_url_xml(url)
        elif 'url' in form and form['url'].value:
            url = form['url'].value
            references_xml = extract_references_from_url_xml(url)
        elif 'txt' in form and form['txt'].value:
            txt = form['txt'].value.decode('utf-8', 'ignore')
            references_xml = extract_references_from_string_xml(txt)
        else:
            references_xml = None

        # If we have not uploaded anything yet
        # Display the form that allows us to do so
        if not references_xml:
            out = docextract_templates.tmpl_web_form()
        else:
            references_html = format_record(0,
                                            'hdref',
                                            xml_record=references_xml,
                                            user_info=user_info)
            out = docextract_templates.tmpl_web_result(references_html)

        # Render the page (including header, footer)
        return page(title='References Extractor',
                    body=out,
                    uid=user_info['uid'],
                    req=req)
def replace_references(recid, uid=None, txt=None, url=None):
    """Replace references for a record

    The record itself is not updated, the marc xml of the document with updated
    references is returned

    Parameters:
    * recid: the id of the record
    * txt: references in text mode
    * inspire: format of ther references
    """
    # Parse references
    if txt is not None:
        references_xml = extract_references_from_string_xml(
            txt, is_only_references=True)
    elif url is not None:
        references_xml = extract_references_from_url_xml(url)
    else:
        references_xml = extract_references_from_record_xml(recid)
    references = create_record(references_xml.encode('utf-8'))

    dummy1, dummy2, record, dummy3, dummy4, dummy5, dummy6 = get_cache_file_contents(
        recid, uid)
    out_xml = None

    references_to_add = record_get_field_instances(references[0],
                                                   tag='999',
                                                   ind1='C',
                                                   ind2='5')
    refextract_status = record_get_field_instances(references[0],
                                                   tag='999',
                                                   ind1='C',
                                                   ind2='6')

    if references_to_add:
        # Replace 999 fields
        record_delete_fields(record, '999')
        record_add_fields(record, '999', references_to_add)
        record_add_fields(record, '999', refextract_status)
        # Update record references
        out_xml = record_xml_output(record)

    return out_xml
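A minimal usage sketch, assuming the surrounding Invenio modules are importable; the record id, uid and the sample reference line are placeholders only.

# Sketch only: rebuild the 999 reference fields of a record from plain-text references.
# replace_references returns the updated MARC XML, or None if nothing was extracted.
sample_txt = "[1] J. Doe, Some Paper, J. Phys. A 1 (2001) 1"
updated_xml = replace_references(1234, uid=1, txt=sample_txt)
if updated_xml:
    print(updated_xml)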
def replace_references(recid, uid=None, txt=None, url=None):
    """Replace references for a record

    The record itself is not updated, the marc xml of the document with updated
    references is returned

    Parameters:
    * recid: the id of the record
    * txt: references in text mode
    * inspire: format of ther references
    """
    # Parse references
    if txt is not None:
        references_xml = extract_references_from_string_xml(txt, is_only_references=True)
    elif url is not None:
        references_xml = extract_references_from_url_xml(url)
    else:
        references_xml = extract_references_from_record_xml(recid)
    references = create_record(references_xml)

    dummy1, dummy2, record, dummy3, dummy4, dummy5, dummy6 = get_cache_contents(recid, uid)
    out_xml = None

    references_to_add = record_get_field_instances(references[0],
                                                   tag='999',
                                                   ind1='C',
                                                   ind2='5')
    refextract_status = record_get_field_instances(references[0],
                                                   tag='999',
                                                   ind1='C',
                                                   ind2='6')

    if references_to_add:
        # Replace 999 fields
        record_delete_fields(record, '999')
        record_add_fields(record, '999', references_to_add)
        record_add_fields(record, '999', refextract_status)
        # Update record references
        out_xml = record_xml_output(record)

    return out_xml
    def extract(self, req, form):
        """Refrences extraction page

        This page can be used for authors to test their pdfs against our
        refrences extraction process"""
        user_info = collect_user_info(req)
        plots = None
        list_image_names = []
        list_caption = []
        plots_dir = os.path.join(CFG_PREFIX, "var/www/img/plots/")
        # unique folder name
        # Handle the 4 POST parameters (pdf, arxiv, url, txt)
        if 'pdf' in form and form['pdf'].value:
            pdf = form['pdf'].value
            references_xml = extract_from_pdf_string(pdf)
            
            pdf_string = form['pdf'].file.read()
            pdf = safe_mkstemp('extract.pdf')
            f = open(pdf, 'w')
            f.write(pdf_string)
            f.close()

            plots = 'File pdf: ' + str(pdf) + '<br />'
            (exit_code, output_buffer, stderr_output_buffer) = run_shell_command(CFG_PDFPLOTEXTRACTOR_PATH + ' ' + pdf)
            plotextracted_pdf_path = pdf + ".extracted/extracted.json"

            code, figures, extracted = merging_articles(None, plotextracted_pdf_path)
            id_fulltext = ""
            marc_path = create_MARCXML(figures, id_fulltext, code, extracted, write_file=True)
            plots += marc_path + '<br />'

            f = open(marc_path, 'r')
            record_xml = f.read()
            f.close()
            
            #plots_dir = "/opt/invenio/var/www/img/plots/"
            if os.path.exists(plots_dir):
                shutil.rmtree(plots_dir)
            os.mkdir(plots_dir)

            re_list = REGEXP_RECORD.findall(record_xml)
            for r in re_list:
                re_subfield = REGEXP_SUBFIELD_A.findall(r)
                for index, image_path in enumerate(re_subfield):
                    if index == 0:
                        run_shell_command('cp ' + image_path + ' ' + plots_dir)

        elif 'arxiv' in form and form['arxiv'].value:
            plots = ""
            url_pdf = make_arxiv_url(arxiv_id=form['arxiv'].value)
            references_xml = extract_references_from_url_xml(url_pdf)
            url_tarball = make_arxiv_tar_url(arxiv_id=form['arxiv'].value)
 
            plotextracted_xml_path, plotextracted_pdf_path = extract_plots_from_latex_and_pdf(url_tarball, url_pdf)
            plots += 'TAR: ' + plotextracted_xml_path + '<br />'
            plots += 'PDF: ' + plotextracted_pdf_path + '<br />'
            
           
            # Previous approach, kept for reference:
            # code, figures, extracted = merging_latex_pdf(plotextracted_xml_path, None, "", )
            # id_fulltext = ""
            # marc_path = create_MARCXML(figures, id_fulltext, code, extracted, write_file=True)
            dest_dir = os.path.join(CFG_TMPDIR, 'textmining')
            try:
                os.mkdir(dest_dir)
            except OSError:
                pass
            code, message, figures, marc_path = merging_latex_pdf(plotextracted_xml_path, "", "", dest_dir)

            plots += 'OUTPUT: ' + marc_path + '<br />'

            f = open(marc_path, 'r')
            record_xml = f.read()
            f.close()
            
            if os.path.exists(plots_dir):
                shutil.rmtree(plots_dir)
            os.mkdir(plots_dir)

            re_list = REGEXP_RECORD.findall(record_xml)
            for r in re_list:
                re_subfield = REGEXP_SUBFIELD_A.findall(r)
                re_subfield_caption = REGEXP_SUBFIELD_D.findall(r) 
                for index, image_path in enumerate(re_subfield):
                    if index == 0:
                        run_shell_command('cp ' + image_path + ' ' + plots_dir)
                        list_image_names.append(os.path.split(image_path)[1])
                        list_caption.append(re_subfield_caption[index])
        
        elif 'url' in form and form['url'].value:
            url = form['url'].value
            references_xml = extract_references_from_url_xml(url)
            plots = "ME3"
        elif 'txt' in form and form['txt'].value:
            txt = form['txt'].value
            references_xml = extract_references_from_string_xml(txt)
        else:
            references_xml = None

        # If we have not uploaded anything yet
        # Display the form that allows us to do so
        if not references_xml:
            out = self.extract_references_template()
        else:
            out = """
            <style type="text/css">
                #referenceinp_link { display: none; }
                /*img.plot { width: 250px; height: 250px; }*/
            </style>
            """
            out += format_record(0,
                                 'hdref',
                                 xml_record=references_xml.encode('utf-8'),
                                 user_info=user_info)
            if plots:
                out += "<h2>Plots</h2>"
                out += plots
                dirList = os.listdir(plots_dir)
                
                for i, fname in enumerate(dirList):
                    out += '<h3>Figure ' + str(i+1) + '</h3> <p><img src="/img/plots/' + fname + '" class="plot"></p>'
                    index = list_image_names.index(fname)
                    out += '<p>' + list_caption[index] + '</p>'

        # Render the page (including header, footer)
        return page(title='Document Extractor',
                    body=out,
                    uid=user_info['uid'],
                    req=req)