def __init__(self, folder='data/mmimdb-256/dataset-resized-256max', split='dev', image_transform=None):
    """Load MM-IMDb metadata, run text extraction over the split's images and
    pre-tokenize every extracted text with a BERT tokenizer.

    Args:
        folder: Root folder of the resized MM-IMDb dataset.
        split: Which dataset split to load ('dev', 'train', ...).
        image_transform: Optional callable applied to images by callers.
    """
    self.json_dir = os.path.join(folder, split, 'metadata')
    self.image_dir = os.path.join(folder, split, 'images')
    self.image_transform = image_transform
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    # Instantiate a model to extract text from the split's images.
    self.text_extractor = TextExtractor(
        folder + "/" + split + "/images/",
        split + "_" + "dataset_text_extract_output.txt", split)

    # Category definitions of movies.
    self.categories = [
        'Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy',
        'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir',
        'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News',
        'Reality-TV', 'Romance', 'Sci-Fi', 'Short', 'Sport', 'Talk-Show',
        'Thriller', 'War', 'Western'
    ]
    # 'idx' avoids shadowing the builtin 'id'.
    self.categories2ids = {
        category: idx for (idx, category) in enumerate(self.categories)
    }

    # Load JSON files. A context manager closes each handle; the previous
    # json.load(open(...)) leaked one file handle per metadata file.
    print("extracting text and getting metadata")
    self.fdir = os.listdir(self.json_dir)
    self.metadata = []
    for fname in sorted(self.fdir):
        if fname.startswith('.'):
            continue
        with open(os.path.join(self.json_dir, fname)) as json_file:
            # fname[:-5] strips the '.json' suffix to recover the movie id.
            self.metadata.append((fname[:-5], json.load(json_file)))
    print(len(self.metadata))
    self.text_extractor.extract_text()
    print(' finished')

    # Pre-tokenizing all sentences.
    print('Tokenizing...', end='')
    self.tokenized_plots = list()
    for i in range(0, len(self.metadata)):
        # Text comes from the extractor output, not the JSON 'plot' field.
        text = self.text_extractor.get_item(i)
        encoded_text = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=256,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt')
        self.tokenized_plots.append(encoded_text)
    print(' finished')
def __init__(self, domain, api_user, api_token, blacklist_file, max_attachment_size, cache_location, start_date: datetime.date):
    """Wire up the collaborators needed to crawl a Confluence domain.

    The text extractor is created first because the repository is configured
    with the MIME types the extractor can handle.
    """
    self._domain = domain
    self._cache_location = cache_location
    self._start_date = start_date

    extractor = TextExtractor()
    self._text_extractor = extractor
    self._repository = ConfluenceRepository(
        domain,
        api_user,
        api_token,
        max_attachment_size,
        extractor.supported_mime_types,
    )
    self._secret_finder = SecretFinder(blacklist_file)
def test_non_ocr_pdf(self):
    """A valid PDF which hasn't been OCR'd should yield no extracted text."""
    file_name = 'non_ocr_file.pdf'
    text_extractor = TextExtractor(
        source_file=file_name,
        source_directory=TextExtractorTest.test_directory,
        working_directory='/tmp',
        testing=True)
    actual_results = text_extractor.get_file_contents_as_array()
    # assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(len(actual_results), 0)
def test_valid_pdf(self):
    """A readable PDF's lines should come back verbatim, newlines included.

    This could well break when we test the file type properly.
    """
    expected_results = ['Test 1\n', 'Test 2\n', '\n']
    file_name = 'test_file1.pdf'
    text_extractor = TextExtractor(
        source_file=file_name,
        source_directory=TextExtractorTest.test_directory,
        working_directory='/tmp',
        testing=True)
    actual_results = text_extractor.get_file_contents_as_array()
    # assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(expected_results, actual_results)
class TestTextExtractor(unittest.TestCase):
    """Integration tests for TextExtractor against the demo scraping site."""

    def setUp(self):
        # Fresh authenticated session before every test.
        self.text_extractor = TextExtractor(
            "https://scraping-for-beginner.herokuapp.com/login_page")
        self.text_extractor.login("imanishi", "kohei")

    def test_get_lecturer_info(self):
        """The first scraped profile should match the known lecturer data."""
        profile, *_ = self.text_extractor.get_lecturer_info()
        self.assertEqual(
            {
                "講師名": "今西 航平",
                "所属企業": "株式会社キカガク",
                "生年月日": "1994年7月15日",
                "出身": "千葉県",
                "趣味": "バスケットボール、読書、ガジェット集め"
            }, profile)

    def test_export_csv(self):
        """Exporting should create the CSV file on disk."""
        _, keys, vals = self.text_extractor.get_lecturer_info()
        self.text_extractor.export_csv(keys, vals, "../csv/lecturer_info.csv")
        # assertTrue gives a clearer failure than assertEqual(True, ...).
        self.assertTrue(path.exists("../csv/lecturer_info.csv"))
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A # PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. # import json from text_extractor import TextExtractor from document_analyzer import DocumentAnalyzer from document_indexer import DocumentIndexer document_indexer = DocumentIndexer() document_analyzer = DocumentAnalyzer() text_extractor = TextExtractor() def handler(event, context): message = json.loads(event['Records'][0]['Sns']['Message']) jobId = message['JobId'] print("JobId=" + jobId) status = message['Status'] print("Status=" + status) if status != "SUCCEEDED": return { # TODO : handle error with Dead letter queue (not in this workshop) # https://docs.aws.amazon.com/lambda/latest/dg/dlq.html
def setUp(self):
    """Create a TextExtractor session and authenticate before each test."""
    login_url = "https://scraping-for-beginner.herokuapp.com/login_page"
    extractor = TextExtractor(login_url)
    extractor.login("imanishi", "kohei")
    self.text_extractor = extractor
from text_extractor import TextExtractor
from and_other_pattern_matcher import AndOtherPatternMatcher
from such_as_pattern_matcher import SuchAsPatternMatcher
from or_other_pattern_matcher import OrOtherPatternMatcher
from including_pattern_matcher import IncludingPatternMatcher
from especially_pattern_matcher import EspeciallyPatternMatcher
from text_extractor_pipe import TextExtractorPipe
from knowledge_graph import KnowledgeGraph
from matcher_pipe import MatcherPipe
import spacy

# Build one extractor per topic. The second argument looks like a Wikidata
# Q-identifier for the topic — confirm against TextExtractor's constructor.
textExtractor1 = TextExtractor("WWII", "Q362")
textExtractor1.extract()
textExtractor2 = TextExtractor("London", "Q84")
textExtractor2.extract()
textExtractor3 = TextExtractor("Paris", "Q90")
textExtractor3.extract()
textExtractor4 = TextExtractor("World War I", "Q361")
textExtractor4.extract()

# Combine all extracted texts into a single pipe that yields one corpus.
textExtractorPipe = TextExtractorPipe()
textExtractorPipe.addTextExtractor(textExtractor1)
textExtractorPipe.addTextExtractor(textExtractor2)
textExtractorPipe.addTextExtractor(textExtractor3)
textExtractorPipe.addTextExtractor(textExtractor4)

nlp = spacy.load('en_core_web_sm')
# NOTE(review): nlp.create_pipe is the spaCy v2 API; under spaCy v3 this
# would be nlp.add_pipe('sentencizer') — confirm the pinned spaCy version.
nlp.add_pipe(nlp.create_pipe('sentencizer'))  # updated
doc = nlp(textExtractorPipe.extract())
andOtherPatternMatcher = AndOtherPatternMatcher(nlp)
suchAsMatcher = SuchAsPatternMatcher(nlp)
class MovieDataset(torch.utils.data.Dataset):
    """MM-IMDb dataset yielding (image, token ids, attention mask, labels).

    Text is produced by a TextExtractor run over the split's images and is
    pre-tokenized once with a BERT tokenizer at construction time; images
    are loaded lazily in __getitem__.
    """

    def __init__(self, folder='data/mmimdb-256/dataset-resized-256max', split='dev', image_transform=None):
        """Load metadata, run text extraction and pre-tokenize all texts.

        Args:
            folder: Root folder of the resized MM-IMDb dataset.
            split: Which dataset split to load ('dev', 'train', ...).
            image_transform: Optional callable applied to each image.
        """
        self.json_dir = os.path.join(folder, split, 'metadata')
        self.image_dir = os.path.join(folder, split, 'images')
        self.image_transform = image_transform
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        # Instantiate a model to extract text from the split's images.
        self.text_extractor = TextExtractor(
            folder + "/" + split + "/images/",
            split + "_" + "dataset_text_extract_output.txt", split)

        # Category definitions of movies.
        self.categories = [
            'Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy',
            'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir',
            'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News',
            'Reality-TV', 'Romance', 'Sci-Fi', 'Short', 'Sport', 'Talk-Show',
            'Thriller', 'War', 'Western'
        ]
        # 'idx' avoids shadowing the builtin 'id'.
        self.categories2ids = {
            category: idx for (idx, category) in enumerate(self.categories)
        }

        # Load JSON files. A context manager closes each handle; the previous
        # json.load(open(...)) leaked one file handle per metadata file.
        print("extracting text and getting metadata")
        self.fdir = os.listdir(self.json_dir)
        self.metadata = []
        for fname in sorted(self.fdir):
            if fname.startswith('.'):
                continue
            with open(os.path.join(self.json_dir, fname)) as json_file:
                # fname[:-5] strips the '.json' suffix -> movie id.
                self.metadata.append((fname[:-5], json.load(json_file)))
        print(len(self.metadata))
        self.text_extractor.extract_text()
        print(' finished')

        # Pre-tokenizing all sentences.
        print('Tokenizing...', end='')
        self.tokenized_plots = list()
        for i in range(0, len(self.metadata)):
            # Text comes from the extractor output, not the JSON 'plot' field.
            text = self.text_extractor.get_item(i)
            encoded_text = self.tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                truncation=True,
                max_length=256,
                padding='max_length',
                return_attention_mask=True,
                return_tensors='pt')
            self.tokenized_plots.append(encoded_text)
        print(' finished')

    def __getitem__(self, index: int):
        """Return (image, token ids, attention mask, multi-hot label vector)."""
        # Load images on the fly. TODO: add caching.
        filename, movie_data = self.metadata[index]
        img_path = os.path.join(self.image_dir, filename + '.jpeg')
        image = Image.open(img_path).convert('RGB')
        text = self.tokenized_plots[index]['input_ids'][0]
        text_mask = self.tokenized_plots[index]['attention_mask'][0]
        genres = movie_data['genres']
        if self.image_transform:
            image = self.image_transform(image)
        # Encode labels in a binary (multi-hot) vector.
        label_vector = torch.zeros((len(self.categories)))
        label_ids = [self.categories2ids[cat] for cat in genres]
        label_vector[label_ids] = 1
        return image, text, text_mask, label_vector

    def load_image_only(self, index: int):
        """Return only the RGB poster image for *index* (no transform)."""
        filename, _ = self.metadata[index]
        img_path = os.path.join(self.image_dir, filename + '.jpeg')
        return Image.open(img_path).convert('RGB')

    def __len__(self):
        return len(self.metadata)
class App(object):
    """Crawl a Confluence domain for secrets across content versions.

    Acts as a context manager: __enter__ opens a local SQLite-backed cache
    used to remember what was already crawled; __exit__ closes it.
    """

    def __init__(self, domain, api_user, api_token, blacklist_file,
                 max_attachment_size, cache_location, start_date: datetime.date):
        self._cache_location = cache_location
        self._start_date = start_date
        self._domain = domain
        self._text_extractor = TextExtractor()
        # The repository is told which MIME types the extractor can handle so
        # only processable attachments (under max_attachment_size) are fetched.
        self._repository = ConfluenceRepository(
            domain, api_user, api_token, max_attachment_size,
            self._text_extractor.supported_mime_types)
        self._secret_finder = SecretFinder(blacklist_file)

    def __enter__(self):
        # Default the cache file next to this module when no location given.
        if self._cache_location:
            cache_path = self._cache_location
        else:
            current_folder = os.path.dirname(os.path.realpath(__file__))
            cache_path = os.path.join(current_folder, "cache.sqlite")
        self._cache = Cache(cache_path, self._domain)
        return self

    def __exit__(self, *args):
        self._cache.close()

    def get_secrets_from_versions(self, content,
                                  start_version) -> Iterable[VersionSecrets]:
        """Yield VersionSecrets for each version of *content* newer than
        *start_version* that contains at least one secret."""
        for version in self._repository.get_versions(content):
            # Versions up to start_version were already crawled earlier.
            if version.id <= start_version:
                continue
            version_content = self._text_extractor.extract_text_from_version(
                content, version)
            secrets = set()
            for secret in self._secret_finder.find_secrets(version_content):
                secrets.add(secret)
            # NOTE(review): any(secrets) tests element truthiness, not just
            # non-emptiness — assumes secret objects are always truthy.
            if any(secrets):
                yield VersionSecrets(content, version, secrets)

    def find_secrets_from_date(self, date) -> Iterable[VersionSecrets]:
        """Crawl day by day from *date* through today, yielding newly found
        secrets and persisting crawl history as each day completes."""
        today = datetime.datetime.now().date()
        while date <= today:
            logging.info(f"Fetching changes for {date}...")
            for content in self._repository.get_content_for_date(date):
                crawl_history = self._cache.get_crawl_history(content.id)
                if crawl_history:
                    new_version_secrets = []
                    # Only re-scan when new versions appeared since last crawl.
                    if crawl_history.latest_version != content.latest_version:
                        logging.info(
                            f"Fetching versions {crawl_history.latest_version}-{content.latest_version} from {content}..."
                        )
                        new_version_secrets = list(
                            self.get_secrets_from_versions(
                                content, crawl_history.latest_version))
                else:
                    # Never crawled before: scan every version from 0.
                    logging.info(
                        f"Fetching {content.latest_version} versions from {content}..."
                    )
                    new_version_secrets = list(
                        self.get_secrets_from_versions(content, 0))
                    crawl_history = ContentCrawlHistory()
                # Drop secrets already recorded for this content, then add the
                # remaining new ones to the history.
                for version_secrets in new_version_secrets:
                    version_secrets.secrets = [
                        s for s in version_secrets.secrets
                        if s not in crawl_history.secrets
                    ]
                    crawl_history.secrets.extend(version_secrets.secrets)
                crawl_history.latest_version = content.latest_version
                self._cache.set_crawl_history(content.id, crawl_history)
                for s in new_version_secrets:
                    if any(s.secrets):
                        yield s
            # Day fully processed — remember it so a rerun resumes from here.
            self._cache.set_last_crawl_date(date)
            date += datetime.timedelta(days=1)

    def find_secrets(self) -> Iterable[VersionSecrets]:
        """Crawl starting from the best-known start date (see _get_start_date)."""
        for s in self.find_secrets_from_date(self._get_start_date()):
            yield s

    def _get_start_date(self) -> datetime.date:
        # Precedence: explicit start date, then cached last-crawl date, then
        # the oldest content creation date known to the repository.
        if self._start_date:
            return self._start_date
        cached_date = self._cache.get_last_crawl_date()
        if cached_date:
            return cached_date
        return self._repository.get_oldest_content_creation_date()
import spacy  # was used below via spacy.load but never imported -> NameError
from sty import fg, bg, ef, rs
from question_processor import QuestionProcessor
from text_extractor import TextExtractor
from text_extractor_pipe import TextExtractorPipe
from context_retriever import ContextRetriever
from answer_retriever import AnswerRetriever
from find_keywords import FindKeywords

# STEP 1: Extract keywords from the question
print(fg.green + "Please enter your question here: " + fg.rs)
question = input()
getKeywords = FindKeywords(question)
key_word = getKeywords.distill()

# STEP 2: Download text from wikipedia
textExtractor = TextExtractor(key_word, "1")
textExtractor.extract()
textExtractorPipe = TextExtractorPipe()
textExtractorPipe.addTextExtractor(textExtractor)

# STEP 3: Retrieve corpus from the text.
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('sentencizer')
doc = nlp(textExtractorPipe.extract())
sentences = [sent.text.strip() for sent in doc.sents]
questionProcessor = QuestionProcessor(nlp)
contextRetriever = ContextRetriever(nlp, 3)
questionContext = contextRetriever.getContext(sentences, questionProcessor.process(question))

# STEP 4: Retrieve answer from the corpus.
answerRetriever = AnswerRetriever()
def can_analyze(file_name):
    """Return True when *file_name* has an HTML extension (case-insensitive)."""
    extension = TextExtractor.get_extension(file_name)
    return extension.lower() in ('.html', '.htm')
def test_file_extension_no_extension():
    """A bare name without a dot should yield an empty extension."""
    extension = TextExtractor.get_extension('dolphin')
    assert extension == ''
def test_file_extension():
    """A normal filename should yield its dotted suffix."""
    extension = TextExtractor.get_extension('dolphin.png')
    assert extension == '.png'