def __init__(self, folder='data/mmimdb-256/dataset-resized-256max', split='dev',
             image_transform=None):
    self.json_dir = os.path.join(folder, split, 'metadata')
    self.image_dir = os.path.join(folder, split, 'images')
    self.image_transform = image_transform
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    # Instantiate a model to extract text from the images.
    self.text_extractor = TextExtractor(
        folder + "/" + split + "/images/",
        split + "_dataset_text_extract_output.txt",
        split)

    # Category definitions of movies.
    self.categories = [
        'Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy',
        'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir',
        'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News',
        'Reality-TV', 'Romance', 'Sci-Fi', 'Short', 'Sport', 'Talk-Show',
        'Thriller', 'War', 'Western'
    ]
    self.categories2ids = {
        category: id for (id, category) in enumerate(self.categories)
    }

    # Load JSON metadata files and run the text extractor over the images.
    print("extracting text and getting metadata")
    self.fdir = os.listdir(self.json_dir)
    self.metadata = [(fname[:-5], json.load(open(os.path.join(self.json_dir, fname))))
                     for fname in sorted(self.fdir) if not fname.startswith('.')]
    print(len(self.metadata))
    self.text_extractor.extract_text()
    print(' finished')

    # Pre-tokenize all extracted texts (previously self.metadata[i][1]['plot'][0]).
    print('Tokenizing...', end='')
    self.tokenized_plots = list()
    for i in range(len(self.metadata)):
        text = self.text_extractor.get_item(i)
        encoded_text = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=256,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt')
        self.tokenized_plots.append(encoded_text)
    print(' finished')
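# A minimal companion sketch for __len__/__getitem__, assuming this is a torch
# Dataset: the 'genres' metadata field, the <id>.jpeg file naming, and the
# returned tuple layout are assumptions for illustration only (requires
# `import torch` and `from PIL import Image` in addition to the imports
# implied above).
def __len__(self):
    return len(self.metadata)

def __getitem__(self, index):
    movie_id, data = self.metadata[index]
    # Build a multi-hot genre vector over the 27 categories (field name assumed).
    label = torch.zeros(len(self.categories))
    for genre in data.get('genres', []):
        if genre in self.categories2ids:
            label[self.categories2ids[genre]] = 1
    # Load and transform the poster image (file naming assumed).
    image = Image.open(os.path.join(self.image_dir, movie_id + '.jpeg')).convert('RGB')
    if self.image_transform is not None:
        image = self.image_transform(image)
    encoded = self.tokenized_plots[index]
    return (image, encoded['input_ids'].squeeze(0),
            encoded['attention_mask'].squeeze(0), label)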
def __init__(self, domain, api_user, api_token, blacklist_file,
             max_attachment_size, cache_location, start_date: datetime.date):
    self._cache_location = cache_location
    self._start_date = start_date
    self._domain = domain
    self._text_extractor = TextExtractor()
    self._repository = ConfluenceRepository(
        domain, api_user, api_token, max_attachment_size,
        self._text_extractor.supported_mime_types)
    self._secret_finder = SecretFinder(blacklist_file)
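# A hypothetical construction sketch: the enclosing class name
# (ConfluenceScanner) and all argument values are assumptions for
# illustration only.
import datetime
import os

scanner = ConfluenceScanner(
    domain="example.atlassian.net",
    api_user="bot@example.com",
    api_token=os.environ["CONFLUENCE_API_TOKEN"],
    blacklist_file="blacklist.txt",
    max_attachment_size=10 * 1024 * 1024,  # 10 MiB
    cache_location="/tmp/confluence-cache",
    start_date=datetime.date(2023, 1, 1),
)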
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
import json

from text_extractor import TextExtractor
from document_analyzer import DocumentAnalyzer
from document_indexer import DocumentIndexer

document_indexer = DocumentIndexer()
document_analyzer = DocumentAnalyzer()
text_extractor = TextExtractor()


def handler(event, context):
    # The job-completion notification arrives as an SNS record.
    message = json.loads(event['Records'][0]['Sns']['Message'])
    jobId = message['JobId']
    print("JobId=" + jobId)
    status = message['Status']
    print("Status=" + status)
    if status != "SUCCEEDED":
        # TODO: handle error with a dead-letter queue (not in this workshop)
        # https://docs.aws.amazon.com/lambda/latest/dg/dlq.html
        return {}
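# A minimal local-invocation sketch, assuming only the SNS message shape that
# the handler above actually parses; the JobId value is illustrative.
if __name__ == '__main__':
    fake_event = {
        'Records': [{
            'Sns': {
                'Message': json.dumps({'JobId': 'example-job-id',
                                       'Status': 'SUCCEEDED'})
            }
        }]
    }
    handler(fake_event, None)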
def setUp(self):
    self.text_extractor = TextExtractor(
        "https://scraping-for-beginner.herokuapp.com/login_page")
    self.text_extractor.login("imanishi", "kohei")
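# A hypothetical companion test: TextExtractor.extract() is an assumption
# modeled on the other snippets in this file, not a confirmed method of
# this particular class.
def test_login_and_extract(self):
    text = self.text_extractor.extract()
    self.assertIsInstance(text, str)
    self.assertTrue(len(text) > 0)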
import spacy

from text_extractor import TextExtractor
from and_other_pattern_matcher import AndOtherPatternMatcher
from such_as_pattern_matcher import SuchAsPatternMatcher
from or_other_pattern_matcher import OrOtherPatternMatcher
from including_pattern_matcher import IncludingPatternMatcher
from especially_pattern_matcher import EspeciallyPatternMatcher
from text_extractor_pipe import TextExtractorPipe
from knowledge_graph import KnowledgeGraph
from matcher_pipe import MatcherPipe

textExtractor1 = TextExtractor("WWII", "Q362")
textExtractor1.extract()
textExtractor2 = TextExtractor("London", "Q84")
textExtractor2.extract()
textExtractor3 = TextExtractor("Paris", "Q90")
textExtractor3.extract()
textExtractor4 = TextExtractor("World War I", "Q361")
textExtractor4.extract()

textExtractorPipe = TextExtractorPipe()
textExtractorPipe.addTextExtractor(textExtractor1)
textExtractorPipe.addTextExtractor(textExtractor2)
textExtractorPipe.addTextExtractor(textExtractor3)
textExtractorPipe.addTextExtractor(textExtractor4)

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('sentencizer')  # updated for spaCy v3; v2 used nlp.create_pipe('sentencizer')
doc = nlp(textExtractorPipe.extract())

andOtherPatternMatcher = AndOtherPatternMatcher(nlp)
suchAsMatcher = SuchAsPatternMatcher(nlp)
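# A hedged continuation sketch: the remaining matchers are imported above but
# unused in the excerpt, and MatcherPipe.addMatcher is an assumption modeled on
# the addTextExtractor naming convention; the KnowledgeGraph population API is
# not shown in the excerpt and is left out.
orOtherMatcher = OrOtherPatternMatcher(nlp)
includingMatcher = IncludingPatternMatcher(nlp)
especiallyMatcher = EspeciallyPatternMatcher(nlp)

matcherPipe = MatcherPipe()
matcherPipe.addMatcher(andOtherPatternMatcher)
matcherPipe.addMatcher(suchAsMatcher)
matcherPipe.addMatcher(orOtherMatcher)
matcherPipe.addMatcher(includingMatcher)
matcherPipe.addMatcher(especiallyMatcher)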
import spacy
from sty import fg, bg, ef, rs

from question_processor import QuestionProcessor
from text_extractor import TextExtractor
from text_extractor_pipe import TextExtractorPipe
from context_retriever import ContextRetriever
from answer_retriever import AnswerRetriever
from find_keywords import FindKeywords

# STEP 1: Extract keywords from the question.
print(fg.green + "Please enter your question here: " + fg.rs)
question = input()
getKeywords = FindKeywords(question)
key_word = getKeywords.distill()

# STEP 2: Download text from Wikipedia.
textExtractor = TextExtractor(key_word, "1")
textExtractor.extract()
textExtractorPipe = TextExtractorPipe()
textExtractorPipe.addTextExtractor(textExtractor)

# STEP 3: Retrieve the corpus from the text.
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('sentencizer')
doc = nlp(textExtractorPipe.extract())
sentences = [sent.text.strip() for sent in doc.sents]
questionProcessor = QuestionProcessor(nlp)
contextRetriever = ContextRetriever(nlp, 3)
questionContext = contextRetriever.getContext(sentences,
                                              questionProcessor.process(question))

# STEP 4: Retrieve the answer from the corpus.
answerRetriever = AnswerRetriever()
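# A hedged completion sketch: AnswerRetriever's call signature is an
# assumption modeled on ContextRetriever.getContext above, not a confirmed API.
answer = answerRetriever.getAnswer(questionProcessor.process(question),
                                   questionContext)
print(fg.blue + "Answer: " + str(answer) + fg.rs)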