Exemplo n.º 1
0
Arquivo: forms.py Projeto: XI-lab/axel
    def get_collocations_alchemy(self):
        """Extract ranked named entities from the uploaded PDF via AlchemyAPI.

        The uploaded ``article_pdf`` from ``self.cleaned_data`` is stored with
        ``handle_uploaded_file``. The executable at ``settings.PDFX_PATH`` is
        run on it unless the ``<name>x.xml`` output already exists, the XML is
        parsed, and the article's full text is posted to the AlchemyAPI
        ``TextGetRankedNamedEntities`` endpoint.

        Returns:
            A list of ``(label, relevance)`` tuples in the order the service
            returned the entities. ``label`` is ``"<text> (<type>)"`` and
            ``relevance`` is the entity's ``relevance`` value as returned by
            the service.

        Raises:
            requests.exceptions.RequestException: if the request fails, times
                out, or the service answers with an HTTP error status.
            KeyError: if the response JSON carries no ``entities`` key.
        """
        full_name = handle_uploaded_file(self.cleaned_data['article_pdf'])
        xml_path = full_name + "x.xml"

        # Only run the converter when its output is not already on disk.
        if not os.path.exists(xml_path):
            subprocess.call([settings.PDFX_PATH, full_name])
        extracted_data = parse_pdfx_xml(xml_path)

        full_text = nlp.get_full_text(extracted_data)['text']
        # NOTE(review): the API key is hard-coded and sent over plain HTTP;
        # it should be moved into settings and the call switched to HTTPS
        # once the endpoint is confirmed to support it.
        payload = {'apikey': 'd0604109bbeb676474b243bc623a0fc1a172437f', 'outputMode': 'json',
                   'maxRetrieve': '100', 'text': full_text}
        # Without a timeout requests waits forever on an unresponsive server,
        # which would hang the worker handling this form.
        response = requests.post('http://access.alchemyapi.com/calls/text/TextGetRankedNamedEntities',
                                 data=payload, timeout=60)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        result = response.json()
        return [
            (entity['text'] + ' (' + entity['type'] + ')', entity['relevance'])
            for entity in result['entities']
        ]
Exemplo n.º 2
0
Arquivo: forms.py Projeto: XI-lab/axel
    def get_collocations(self):
        """Score the n-grams of the uploaded PDF with the form's classifier.

        The uploaded ``article_pdf`` from ``self.cleaned_data`` is stored with
        ``handle_uploaded_file``. The executable at ``settings.PDFX_PATH`` is
        run on it unless the ``<name>x.xml`` output already exists. A temporary
        article is generated from the parsed full text, features are built
        for it, and each feature is passed to ``self.CLF.predict_proba``.

        The temporary article and every ``Collocations`` row with ``count=0``
        are deleted afterwards, whether or not scoring succeeded.

        Returns:
            A list of ``(ngram, score)`` tuples sorted by ``score`` in
            descending order, where ``score`` is element ``[0][1]`` of the
            ``predict_proba`` result for that n-gram's feature.
        """
        from axel.stats.models import Collocations

        pdf_path = handle_uploaded_file(self.cleaned_data['article_pdf'])
        xml_path = pdf_path + "x.xml"

        # Only run the converter when its output is not already on disk.
        if not os.path.exists(xml_path):
            subprocess.call([settings.PDFX_PATH, pdf_path])

        parsed = parse_pdfx_xml(xml_path)
        text = nlp.get_full_text(parsed)['text']
        temp_article = PDFUploadForm.generate_temp_article(text)

        try:
            scored = [
                (ngram, self.CLF.predict_proba(feature)[0][1])
                for ngram, feature in PDFUploadForm.build_features(temp_article)
            ]
        finally:
            # Always clean up the temporary article and zero-count rows.
            temp_article.delete()
            Collocations.objects.filter(count=0).delete()

        return sorted(scored, key=lambda pair: pair[1], reverse=True)