示例#1
0
 def __init__(self, file_name, url, label):
     """Open the PDF at ``file_name`` and initialise the parser state.

     Args:
         file_name: path of the PDF file. It is opened to obtain the stream
             handed to ``DocumentParser.__init__`` and is also passed to
             ``poppler.PDFDoc``.
         url: stored on the instance as ``_url``.
         label: stored on the instance as ``_label``.
     """
     # ``open`` replaces the ``file`` builtin, which no longer exists in
     # Python 3; called with a single argument both open the file the
     # same way under Python 2.
     file_stream = open(file_name)
     DocumentParser.__init__(self, file_stream)
     self._url = url
     self._label = label
     # Starts out unset. NOTE(review): presumably filled in lazily by
     # another method of the class -- confirm.
     self._page_id_to_page_numbers = None
     self._doc = poppler.PDFDoc(file_name)
    def __init__(self, file_stream, url):
        """Read a METS document and parse it into ``self._doc``.

        Args:
            file_stream: passed to ``DocumentParser.__init__``. The content
                is then read in full from ``self._file_stream``
                (NOTE(review): presumably the same stream, set by the base
                initialiser -- confirm).
            url: stored on the instance as ``_url``.

        Raises:
            ParserError.InvalidDocument: if ``parseString`` fails on the
                content, or if ``_check_xml()`` does not return ``True``.
        """
        DocumentParser.__init__(self, file_stream)
        self._url = url
        self._namespace_URI = 'http://www.loc.gov/METS/'
        self._mods_namespace_URI = 'http://www.loc.gov/mods/v3'

        # Read the whole content of the file.
        self._content_str = self._file_stream.read()

        # Results are not computed here; all of them start out unset.
        self._logical_structure = None
        self._physical_structure = None
        self._meta_data = None
        self._relation = None
        self._file_list = None

        # NOTE: some METS files spell the METS/MODS directives in uppercase.
        # A string-replace normalisation of the content used to be sketched
        # here but was disabled; the content is parsed as-is.

        # The original message was built from two literals with no space
        # between them and rendered as "(is itcorrupted?)".
        invalid_message = "The file is invalid. (is it corrupted?)"
        try:
            self._doc = parseString(self._content_str)
        except Exception:
            raise ParserError.InvalidDocument(invalid_message)
        if self._check_xml() is not True:
            raise ParserError.InvalidDocument(invalid_message)
示例#3
0
 def __init__(self, file_stream, url, label, mime):
     """Set up the parser and record the dimensions of the image.

     ``file_stream`` is handed to ``DocumentParser.__init__`` and then
     opened with ``Image.open`` to read its size. ``url``, ``label`` and
     ``mime`` are stored as ``_url``, ``_label`` and ``_mime``.
     """
     DocumentParser.__init__(self, file_stream)
     self._url, self._label, self._mime = url, label, mime
     # ``size`` unpacks into exactly two values: width first, then height.
     width, height = Image.open(file_stream).size
     self._width = width
     self._height = height
示例#4
0
 def __init__(self, file_name, url, label):
     """Open the PDF at ``file_name`` and initialise the parser state.

     Args:
         file_name: path of the PDF file. It is opened to obtain the stream
             handed to ``DocumentParser.__init__`` and is also passed to
             ``poppler.PDFDoc``.
         url: stored on the instance as ``_url``.
         label: stored on the instance as ``_label``.
     """
     # ``open`` replaces the ``file`` builtin, which no longer exists in
     # Python 3; called with a single argument both open the file the
     # same way under Python 2.
     file_stream = open(file_name)
     DocumentParser.__init__(self, file_stream)
     self._url = url
     self._label = label
     # Starts out unset. NOTE(review): presumably filled in lazily by
     # another method of the class -- confirm.
     self._page_id_to_page_numbers = None
     self._doc = poppler.PDFDoc(file_name)
 def __init__(self, file_stream, url, label, mime):
     """Set up the parser and record the dimensions of the image.

     ``file_stream`` is handed to ``DocumentParser.__init__`` and then
     opened with ``Image.open`` to read its size. ``url``, ``label`` and
     ``mime`` are stored as ``_url``, ``_label`` and ``_mime``.
     """
     DocumentParser.__init__(self, file_stream)
     self._url, self._label, self._mime = url, label, mime
     # ``size`` unpacks into exactly two values: width first, then height.
     width, height = Image.open(file_stream).size
     self._width = width
     self._height = height
示例#6
0
 def __init__(self, file_stream, url):
     """Initialise the parser from ``file_stream`` and remember ``url``.

     Args:
         file_stream: passed unchanged to ``DocumentParser.__init__``.
         url: stored on the instance as ``_url``.
     """
     DocumentParser.__init__(self, file_stream)
     self._url = url
示例#7
0
 def __init__(self, file_stream, url):
     """Initialise the parser from ``file_stream`` and remember ``url``.

     Args:
         file_stream: passed unchanged to ``DocumentParser.__init__``.
         url: stored on the instance as ``_url``.
     """
     # Namespace URI of the Dublin Core element set. It is assigned before
     # the base initialiser runs.
     # NOTE(review): presumably the base initialiser relies on it already
     # being set -- confirm before reordering.
     self._namespace_URI = 'http://purl.org/dc/elements/1.1/'
     DocumentParser.__init__(self, file_stream)
     self._url = url
示例#8
0
 def __init__(self, file_stream, url):
     """Initialise the parser from ``file_stream`` and remember ``url``.

     Args:
         file_stream: passed unchanged to ``DocumentParser.__init__``.
         url: stored on the instance as ``_url``.
     """
     DocumentParser.__init__(self, file_stream)
     self._url = url
示例#9
0
 def __init__(self, file_stream, url):
     """Initialise the parser from ``file_stream`` and remember ``url``.

     Args:
         file_stream: passed unchanged to ``DocumentParser.__init__``.
         url: stored on the instance as ``_url``.
     """
     # Namespace URI of the Dublin Core element set. It is assigned before
     # the base initialiser runs.
     # NOTE(review): presumably the base initialiser relies on it already
     # being set -- confirm before reordering.
     self._namespace_URI = 'http://purl.org/dc/elements/1.1/'
     DocumentParser.__init__(self, file_stream)
     self._url = url
示例#10
0
from parser import DocumentParser
from store import DocumentStoreFactory, TermStoreFactory
from index import IndexFactory
from parser import TextParser

# Sample corpus: three (url, text) pairs, kept next to each other.
url1 = "https://stackoverflow.com/questions/9626535/get-domain-name-from-url"
text1 = "Extracting domain from URL in python"

url2 = "https://ashiknesin.com"
text2 = "How to Get Domain Name from URL String domain in Python"

url3 = "https://answers.splunk.com/answers/188774/how-to-automatically-extract-domain-from-url-throu.html"
text3 = "How to automatically extract domain from URL through conf files at search-time"

# Parse the pairs in order (1, 2, 3); unpacking drives the generator to
# completion, so all three calls happen on this statement.
doc1, doc2, doc3 = (
    DocumentParser.parse_document(page_url, page_text)
    for page_url, page_text in ((url1, text1), (url2, text2), (url3, text3))
)

# Show what the document store holds once parsing is done.
doc_store = DocumentStoreFactory.get_store()
print(doc_store._data)

# Add every parsed document to the "default" index, then print the index.
index = IndexFactory.get_or_create_index("default")
index.add_document(doc1)
index.add_document(doc2)
index.add_document(doc3)
index.display()

# The query module is imported only at this point, after the index is built.
from query import QueryEvaluator

qeval = QueryEvaluator(IndexFactory, TermStoreFactory)