def __init__(self, file_name, url, label):
    """Parse a PDF document identified by a file name.

    Args:
        file_name: path of the PDF file on disk.
        url: source URL the document was fetched from.
        label: human-readable label for the document.
    """
    # open() replaces the Python 2 `file()` builtin (removed in Python 3);
    # 'rb' because PDF is a binary format and must not be newline-translated.
    file_stream = open(file_name, 'rb')
    DocumentParser.__init__(self, file_stream)
    self._url = url
    self._label = label
    # lazily built elsewhere on first use
    self._page_id_to_page_numbers = None
    self._doc = poppler.PDFDoc(file_name)
def __init__(self, file_stream, url):
    """Parse a METS document from an open file stream.

    Reads the whole stream, parses it as XML and validates that it is a
    well-formed METS file.

    Args:
        file_stream: open, readable stream containing the METS XML.
        url: source URL the document was fetched from.

    Raises:
        ParserError.InvalidDocument: if the content is not parseable XML
            or fails the METS structure check.
    """
    DocumentParser.__init__(self, file_stream)
    self._url = url
    self._namespace_URI = 'http://www.loc.gov/METS/'
    self._mods_namespace_URI = 'http://www.loc.gov/mods/v3'
    # read the full content of the file up front; later queries work on
    # the parsed DOM
    self._content_str = self._file_stream.read()
    # caches, populated lazily by the corresponding accessors
    self._logical_structure = None
    self._physical_structure = None
    self._meta_data = None
    self._relation = None
    self._file_list = None
    try:
        self._doc = parseString(self._content_str)
    except Exception:
        # fixed message: the two fragments previously concatenated to
        # "(is itcorrupted?)" — missing separating space
        raise ParserError.InvalidDocument(
            "The file is invalid. (is it corrupted?)")
    if self._check_xml() is not True:
        raise ParserError.InvalidDocument(
            "The file is invalid. (is it corrupted?)")
def __init__(self, file_stream, url, label, mime):
    """Parse an image document: record its URL, label, MIME type and
    pixel dimensions.

    Args:
        file_stream: open, readable stream containing the image data.
        url: source URL the image was fetched from.
        label: human-readable label for the image.
        mime: MIME type of the image (e.g. 'image/jpeg').
    """
    DocumentParser.__init__(self, file_stream)
    self._url = url
    self._label = label
    self._mime = mime
    # probe the image header to record its size in pixels
    image = Image.open(file_stream)
    self._width, self._height = image.size
def __init__(self, file_stream, url):
    """Initialize the base parser with the stream and remember the
    document's source URL.

    Args:
        file_stream: open, readable stream with the document content.
        url: source URL the document was fetched from.
    """
    DocumentParser.__init__(self, file_stream)
    self._url = url
def __init__(self, file_stream, url):
    """Initialize a Dublin Core document parser.

    Args:
        file_stream: open, readable stream with the document content.
        url: source URL the document was fetched from.
    """
    # Dublin Core element-set namespace; set before the base-class call
    # to preserve the original initialization order
    self._namespace_URI = 'http://purl.org/dc/elements/1.1/'
    DocumentParser.__init__(self, file_stream)
    self._url = url
"""Demo script: index three sample documents and set up a query evaluator."""
from parser import DocumentParser, TextParser
from store import DocumentStoreFactory, TermStoreFactory
from index import IndexFactory

# Sample corpus, grouped as (url, text) pairs.
url1 = "https://stackoverflow.com/questions/9626535/get-domain-name-from-url"
text1 = "Extracting domain from URL in python"
url2 = "https://ashiknesin.com"
text2 = "How to Get Domain Name from URL String domain in Python"
url3 = "https://answers.splunk.com/answers/188774/how-to-automatically-extract-domain-from-url-throu.html"
text3 = "How to automatically extract domain from URL through conf files at search-time"

doc1 = DocumentParser.parse_document(url1, text1)
doc2 = DocumentParser.parse_document(url2, text2)
doc3 = DocumentParser.parse_document(url3, text3)

doc_store = DocumentStoreFactory.get_store()
# NOTE(review): peeks at a private attribute for debugging output
print(doc_store._data)

index = IndexFactory.get_or_create_index("default")
for doc in (doc1, doc2, doc3):
    index.add_document(doc)
index.display()

# Imported here (not at the top) to preserve the original import-time
# ordering relative to the indexing above.
from query import QueryEvaluator

qeval = QueryEvaluator(IndexFactory, TermStoreFactory)