def __init__(self,
                 folder='data/mmimdb-256/dataset-resized-256max',
                 split='dev',
                 image_transform=None):
        self.json_dir = os.path.join(folder, split, 'metadata')
        self.image_dir = os.path.join(folder, split, 'images')
        self.image_transform = image_transform
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.text_extractor = TextExtractor(
            folder + "/" + split + "/images/",
            split + "_" + "dataset_text_extract_output.txt", split)
        # Instantiate a model to extract text from the poster images.

        # Category definitions of movies.
        self.categories = [
            'Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy',
            'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir',
            'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News',
            'Reality-TV', 'Romance', 'Sci-Fi', 'Short', 'Sport', 'Talk-Show',
            'Thriller', 'War', 'Western'
        ]
        self.categories2ids = {
            category: id
            for (id, category) in enumerate(self.categories)
        }

        # Load the JSON metadata files and run text extraction over the poster images.
        print('Loading metadata from %s and extracting text...' % self.json_dir)
        self.fdir = os.listdir(self.json_dir)
        self.metadata = [(fname[:-5],  # strip the '.json' extension
                          json.load(open(os.path.join(self.json_dir, fname))))
                         for fname in sorted(self.fdir)
                         if not fname.startswith('.')]
        self.text_extractor.extract_text()
        print('Loaded %d entries; text extraction finished.' % len(self.metadata))

        # Pre-tokenize all extracted texts.
        print('Tokenizing...', end='')
        self.tokenized_plots = list()
        for i in range(len(self.metadata)):
            # Tokenize the extracted text; self.metadata[i][1]['plot'][0] is the alternative source.
            text = self.text_extractor.get_item(i)
            encoded_text = self.tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                truncation=True,
                max_length=256,
                padding='max_length',
                return_attention_mask=True,
                return_tensors='pt')
            self.tokenized_plots.append(encoded_text)
        print(' finished')
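
For reference, a minimal standalone sketch of the same encode_plus call used above; the sample plot text and the printed shapes are illustrative, not taken from the dataset.

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Sample text is invented; the arguments mirror the encode_plus call in __init__ above.
encoded = tokenizer.encode_plus(
    'A detective hunts a killer across 1940s Los Angeles.',
    add_special_tokens=True,
    truncation=True,
    max_length=256,
    padding='max_length',
    return_attention_mask=True,
    return_tensors='pt')
print(encoded['input_ids'].shape)       # torch.Size([1, 256])
print(encoded['attention_mask'].shape)  # torch.Size([1, 256])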
Example 2
 def __init__(self, domain, api_user, api_token, blacklist_file,
              max_attachment_size, cache_location,
              start_date: datetime.date):
     self._cache_location = cache_location
     self._start_date = start_date
     self._domain = domain
     self._text_extractor = TextExtractor()
     self._repository = ConfluenceRepository(
         domain, api_user, api_token, max_attachment_size,
         self._text_extractor.supported_mime_types)
     self._secret_finder = SecretFinder(blacklist_file)
Example 3
    def test_non_ocr_pdf(self):
        """
            Access a valid PDF which hasn't been OCR'd.
        """
        file_name = 'non_ocr_file.pdf'

        text_extractor = TextExtractor(
            source_file=file_name,
            source_directory=TextExtractorTest.test_directory,
            working_directory='/tmp',
            testing=True)

        actual_results = text_extractor.get_file_contents_as_array()

        self.assertEqual(len(actual_results), 0)
Example 4
    def test_valid_pdf(self):
        """
            Access a valid PDF and check its extracted contents.
            This could well break when we test the file type properly.
        """

        expected_results = [
            'Test 1\n',
            'Test 2\n',
            '\n'
        ]
        file_name = 'test_file1.pdf'

        text_extractor = TextExtractor(
            source_file=file_name,
            source_directory=TextExtractorTest.test_directory,
            working_directory='/tmp',
            testing=True)

        actual_results = text_extractor.get_file_contents_as_array()

        self.assertEqual(expected_results, actual_results)
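
Outside the test harness, the same call pattern would presumably look like the sketch below; the file paths and the testing flag are assumptions, not part of the original suite.

# Hypothetical usage of the file-based extractor exercised by the tests above.
extractor = TextExtractor(
    source_file='report.pdf',           # assumed input file
    source_directory='/data/incoming',  # assumed source directory
    working_directory='/tmp',
    testing=False)

for line in extractor.get_file_contents_as_array():
    print(line, end='')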
Example 5
class TestTextExtractor(unittest.TestCase):
    def setUp(self):
        self.text_extractor = TextExtractor(
            "https://scraping-for-beginner.herokuapp.com/login_page")
        self.text_extractor.login("imanishi", "kohei")

    def test_get_lecturer_info(self):
        profile, *_ = self.text_extractor.get_lecturer_info()
        self.assertEqual(
            {
                "講師名": "今西 航平",
                "所属企業": "株式会社キカガク",
                "生年月日": "1994年7月15日",
                "出身": "千葉県",
                "趣味": "バスケットボール、読書、ガジェット集め"
            }, profile)

    def test_export_csv(self):
        _, keys, vals = self.text_extractor.get_lecturer_info()
        self.text_extractor.export_csv(keys, vals, "../csv/lecturer_info.csv")
        self.assertEqual(True, path.exists("../csv/lecturer_info.csv"))
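
Put together outside unittest, the scraping workflow these tests exercise would look roughly like the sketch below; the CSV output path is an assumption.

# Rough standalone sketch of the login -> scrape -> export flow tested above.
extractor = TextExtractor(
    "https://scraping-for-beginner.herokuapp.com/login_page")
extractor.login("imanishi", "kohei")
profile, keys, vals = extractor.get_lecturer_info()
extractor.export_csv(keys, vals, "lecturer_info.csv")  # output path is assumed
print(profile)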
Example 6
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
import json
from text_extractor import TextExtractor
from document_analyzer import DocumentAnalyzer
from document_indexer import DocumentIndexer

document_indexer = DocumentIndexer()
document_analyzer = DocumentAnalyzer()
text_extractor = TextExtractor()


def handler(event, context):
    message = json.loads(event['Records'][0]['Sns']['Message'])

    jobId = message['JobId']
    print("JobId=" + jobId)

    status = message['Status']
    print("Status=" + status)

    if status != "SUCCEEDED":
        return {
            # TODO : handle error with Dead letter queue (not in this workshop)
            # https://docs.aws.amazon.com/lambda/latest/dg/dlq.html
Example 7
 def setUp(self):
     self.text_extractor = TextExtractor(
         "https://scraping-for-beginner.herokuapp.com/login_page")
     self.text_extractor.login("imanishi", "kohei")
Example 8
from text_extractor import TextExtractor
from and_other_pattern_matcher import AndOtherPatternMatcher
from such_as_pattern_matcher import SuchAsPatternMatcher
from or_other_pattern_matcher import OrOtherPatternMatcher
from including_pattern_matcher import IncludingPatternMatcher
from especially_pattern_matcher import EspeciallyPatternMatcher
from text_extractor_pipe import TextExtractorPipe
from knowledge_graph import KnowledgeGraph
from matcher_pipe import MatcherPipe
import spacy

textExtractor1 = TextExtractor("WWII", "Q362")
textExtractor1.extract()
textExtractor2 = TextExtractor("London", "Q84")
textExtractor2.extract()
textExtractor3 = TextExtractor("Paris", "Q90")
textExtractor3.extract()
textExtractor4 = TextExtractor("World War I", "Q361")
textExtractor4.extract()
textExtractorPipe = TextExtractorPipe()
textExtractorPipe.addTextExtractor(textExtractor1)
textExtractorPipe.addTextExtractor(textExtractor2)
textExtractorPipe.addTextExtractor(textExtractor3)
textExtractorPipe.addTextExtractor(textExtractor4)

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe(nlp.create_pipe('sentencizer'))  # spaCy 2.x API; in spaCy 3.x use nlp.add_pipe('sentencizer')
doc = nlp(textExtractorPipe.extract())

andOtherPatternMatcher = AndOtherPatternMatcher(nlp)
suchAsMatcher = SuchAsPatternMatcher(nlp)
Example 9
class MovieDataset(torch.utils.data.Dataset):
    def __init__(self,
                 folder='data/mmimdb-256/dataset-resized-256max',
                 split='dev',
                 image_transform=None):
        self.json_dir = os.path.join(folder, split, 'metadata')
        self.image_dir = os.path.join(folder, split, 'images')
        self.image_transform = image_transform
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.text_extractor = TextExtractor(
            folder + "/" + split + "/images/",
            split + "_" + "dataset_text_extract_output.txt", split)
        # Instantiate a model to extract text from the poster images.

        # Category definitions of movies.
        self.categories = [
            'Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy',
            'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir',
            'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News',
            'Reality-TV', 'Romance', 'Sci-Fi', 'Short', 'Sport', 'Talk-Show',
            'Thriller', 'War', 'Western'
        ]
        self.categories2ids = {
            category: id
            for (id, category) in enumerate(self.categories)
        }

        # Load the JSON metadata files and run text extraction over the poster images.
        print('Loading metadata from %s and extracting text...' % self.json_dir)
        self.fdir = os.listdir(self.json_dir)
        self.metadata = [(fname[:-5],  # strip the '.json' extension
                          json.load(open(os.path.join(self.json_dir, fname))))
                         for fname in sorted(self.fdir)
                         if not fname.startswith('.')]
        self.text_extractor.extract_text()
        print('Loaded %d entries; text extraction finished.' % len(self.metadata))

        # Pre-tokenize all extracted texts.
        print('Tokenizing...', end='')
        self.tokenized_plots = list()
        for i in range(len(self.metadata)):
            # Tokenize the extracted text; self.metadata[i][1]['plot'][0] is the alternative source.
            text = self.text_extractor.get_item(i)
            encoded_text = self.tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                truncation=True,
                max_length=256,
                padding='max_length',
                return_attention_mask=True,
                return_tensors='pt')
            self.tokenized_plots.append(encoded_text)
        print(' finished')

    def __getitem__(self, index: int):
        # Load images on the fly.
        filename, movie_data = self.metadata[index]
        img_path = os.path.join(self.image_dir, filename + '.jpeg')
        image = Image.open(img_path).convert('RGB')
        # TODO: add caching of decoded images.
        text = self.tokenized_plots[index]['input_ids'][0]
        text_mask = self.tokenized_plots[index]['attention_mask'][0]
        genres = movie_data['genres']

        if self.image_transform: image = self.image_transform(image)

        # Encode labels in a binary vector.
        label_vector = torch.zeros((len(self.categories)))
        label_ids = [self.categories2ids[cat] for cat in genres]
        label_vector[label_ids] = 1

        return image, text, text_mask, label_vector

    def load_image_only(self, index: int):
        filename, movie_data = self.metadata[index]
        img_path = os.path.join(self.image_dir, filename + '.jpeg')
        image = Image.open(img_path).convert('RGB')
        return image

    def __len__(self):
        return len(self.metadata)
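
A minimal sketch of how MovieDataset might be consumed during training; the image transform, batch size, and number of workers below are assumptions rather than settings from the original project.

import torch
import torchvision.transforms as T

# Assumed preprocessing; the original code leaves image_transform up to the caller.
transform = T.Compose([T.Resize((224, 224)), T.ToTensor()])
dataset = MovieDataset(split='dev', image_transform=transform)
loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True, num_workers=2)

for images, texts, text_masks, labels in loader:
    # images: (B, 3, 224, 224); texts/text_masks: (B, 256); labels: (B, 27) multi-hot
    break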
Example 10
class App(object):
    def __init__(self, domain, api_user, api_token, blacklist_file,
                 max_attachment_size, cache_location,
                 start_date: datetime.date):
        self._cache_location = cache_location
        self._start_date = start_date
        self._domain = domain
        self._text_extractor = TextExtractor()
        self._repository = ConfluenceRepository(
            domain, api_user, api_token, max_attachment_size,
            self._text_extractor.supported_mime_types)
        self._secret_finder = SecretFinder(blacklist_file)

    def __enter__(self):
        if self._cache_location:
            cache_path = self._cache_location
        else:
            current_folder = os.path.dirname(os.path.realpath(__file__))
            cache_path = os.path.join(current_folder, "cache.sqlite")
        self._cache = Cache(cache_path, self._domain)
        return self

    def __exit__(self, *args):
        self._cache.close()

    def get_secrets_from_versions(self, content,
                                  start_version) -> Iterable[VersionSecrets]:
        for version in self._repository.get_versions(content):
            if version.id <= start_version:
                continue

            version_content = self._text_extractor.extract_text_from_version(
                content, version)
            secrets = set()
            for secret in self._secret_finder.find_secrets(version_content):
                secrets.add(secret)

            if any(secrets):
                yield VersionSecrets(content, version, secrets)

    def find_secrets_from_date(self, date) -> Iterable[VersionSecrets]:
        today = datetime.datetime.now().date()
        while date <= today:
            logging.info(f"Fetching changes for {date}...")
            for content in self._repository.get_content_for_date(date):
                crawl_history = self._cache.get_crawl_history(content.id)
                if crawl_history:
                    new_version_secrets = []
                    if crawl_history.latest_version != content.latest_version:
                        logging.info(
                            f"Fetching versions {crawl_history.latest_version}-{content.latest_version} from {content}..."
                        )
                        new_version_secrets = list(
                            self.get_secrets_from_versions(
                                content, crawl_history.latest_version))
                else:
                    logging.info(
                        f"Fetching {content.latest_version} versions from {content}..."
                    )
                    new_version_secrets = list(
                        self.get_secrets_from_versions(content, 0))
                    crawl_history = ContentCrawlHistory()

                for version_secrets in new_version_secrets:
                    version_secrets.secrets = [
                        s for s in version_secrets.secrets
                        if s not in crawl_history.secrets
                    ]
                    crawl_history.secrets.extend(version_secrets.secrets)

                crawl_history.latest_version = content.latest_version
                self._cache.set_crawl_history(content.id, crawl_history)
                for s in new_version_secrets:
                    if any(s.secrets):
                        yield s

            self._cache.set_last_crawl_date(date)
            date += datetime.timedelta(days=1)

    def find_secrets(self) -> Iterable[VersionSecrets]:
        for s in self.find_secrets_from_date(self._get_start_date()):
            yield s

    def _get_start_date(self) -> datetime.date:
        if self._start_date:
            return self._start_date
        cached_date = self._cache.get_last_crawl_date()
        if cached_date:
            return cached_date
        return self._repository.get_oldest_content_creation_date()
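
A hypothetical driver for the class above; every constructor argument below is a placeholder rather than a real Confluence endpoint, credential, or file path.

import datetime

# Placeholder arguments only; domain, user, token, and paths are made up.
app = App(domain="example.atlassian.net",
          api_user="bot@example.com",
          api_token="xxxx",
          blacklist_file="blacklist.txt",
          max_attachment_size=10 * 1024 * 1024,
          cache_location=None,
          start_date=datetime.date(2023, 1, 1))

with app:
    for version_secrets in app.find_secrets():
        print(version_secrets)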
Example 11
from sty import fg, bg, ef, rs
from question_processor import QuestionProcessor
from text_extractor import TextExtractor
from text_extractor_pipe import TextExtractorPipe
from context_retriever import ContextRetriever
from answer_retriever import AnswerRetriever
from find_keywords import FindKeywords
import spacy

# STEP 1: Extract keywords from the question
print(fg.green + "Please enter your question here: " + fg.rs)
question = input()
getKeywords = FindKeywords(question)
key_word = getKeywords.distill()

# STEP 2: Download text from wikipedia
textExtractor = TextExtractor(key_word, "1")
textExtractor.extract()
textExtractorPipe = TextExtractorPipe()
textExtractorPipe.addTextExtractor(textExtractor)

# STEP 3: Retrieve corpus from the text.
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('sentencizer')
doc = nlp(textExtractorPipe.extract())
sentences = [sent.text.strip() for sent in doc.sents]
questionProcessor = QuestionProcessor(nlp)
contextRetriever = ContextRetriever(nlp, 3)
questionContext = contextRetriever.getContext(sentences, questionProcessor.process(question))

# STEP 4: Retrieve answer from the corpus.
answerRetriever = AnswerRetriever()
Example 12
 def can_analyze(file_name):
     ext = TextExtractor.get_extension(file_name) 
     return ext.lower() in [ '.html', '.htm' ]
Example 13
def test_file_extension_no_extension():
    assert '' == TextExtractor.get_extension('dolphin')
Example 14
def test_file_extension():
    assert '.png' == TextExtractor.get_extension('dolphin.png')
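
Judging from the two tests above, get_extension is a static helper that returns the dotted suffix or an empty string; a quick illustrative call (the file names below are made up):

print(TextExtractor.get_extension('report.pdf'))  # expected: '.pdf'
print(TextExtractor.get_extension('README'))      # expected: ''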