Example #1
class Wind:
    def __init__(self):
        self.name_getter = ThuItemGetter()
        self.location_getter = ThuLocationGetter()
        self.item_getter = ThuItemGetter()
        self.relation_getter = None
        self.doc_reader = DocReader()

    def process(self, dir_name='data'):
        # Walk every file in the data directory and extract entities from each.
        if os.path.exists(dir_name) and os.path.isdir(dir_name):
            for file_name in os.listdir(dir_name):
                relative_file_name = os.path.join(dir_name, file_name)
                self.doc_reader.load(relative_file_name)
                names = set()
                locations = set()
                items = set()
                relations = set()
                params = []
                for param in self.doc_reader.process():
                    params.append(param)
                    names = names.union(self.name_getter.process(param))
                    locations = locations.union(
                        self.location_getter.process(param))
                    items = items.union(self.item_getter.process(param))
                    # relations = relations.union(self.relation_getter.process(params, names))

                yield (file_name, names, locations, items, relations)
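Since process is a generator, a caller drives the pipeline by iterating over it. A minimal driver sketch (assuming a data/ directory of files that DocReader.load understands):

wind = Wind()
for file_name, names, locations, items, relations in wind.process('data'):
    # Each yielded tuple summarizes one input file.
    print('{}: {} names, {} locations, {} items'.format(
        file_name, len(names), len(locations), len(items)))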
Example #2
    def web_doc_similarity(self):
        start = time.time()
        df = pd.read_csv(DOC_TEXTS)
        df2 = pd.read_csv(SCRAPE_RESULTS_CSV)

        dr = DocReader()
        # Convert each web passage and document paragraph to WordNet synsets
        webSynSets = []
        docSynSets = []
        for r, t in zip(df2.content.tolist(), df.ParaText.tolist()):
            webSynSets.append(dr.str_to_synsets(str(r)))
            docSynSets.append(dr.str_to_synsets(str(t)))

        result_scores = 0.0
        indiv_scores = []
        simi_content = []
        simi_links = []

        # Track web passages that have already been matched, so the same
        # passage is not credited to more than one document paragraph.
        # (The original deleted entries from webSynSets mid-loop, which
        # desynchronized the running index from the df2 row labels.)
        used_indices = set()
        for docsets in docSynSets:
            best_score = 0.0
            best_content = ""
            best_link = ""
            best_index = 0
            for it, websets in enumerate(webSynSets):
                if it in used_indices:
                    continue
                simi = dr.quick_compare(docsets, websets)
                print(simi)
                if simi > 0.01 and simi > best_score:
                    best_score = simi
                    best_content = df2.loc[it, 'content']
                    best_link = df2.loc[it, 'url']
                    best_index = it
            if best_score > 0.0:
                used_indices.add(best_index)

            indiv_scores.append(best_score)
            result_scores += best_score

            simi_links.append(best_link)
            simi_content.append(best_content)

        end = time.time()
        print(end - start)
        return indiv_scores, simi_links, simi_content
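str_to_synsets and quick_compare are DocReader methods not shown here. A rough sketch of what they might do, using NLTK WordNet path similarity (this is an assumption, not the project's actual scoring):

from nltk.corpus import wordnet as wn

def str_to_synsets(text):
    # First WordNet synset for each whitespace token that has one (assumed).
    candidates = [wn.synsets(tok) for tok in text.lower().split()]
    return [c[0] for c in candidates if c]

def quick_compare(sets1, sets2):
    # Average, over sets1, of the best path similarity against sets2.
    if not sets1 or not sets2:
        return 0.0
    total = sum(max(s1.path_similarity(s2) or 0.0 for s2 in sets2)
                for s1 in sets1)
    return total / len(sets1)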
Example #3
class DocReaderTest(unittest.TestCase):
    def setUp(self):
        self.aq_root = '/dropbox/17-18/573/AQUAINT'
        self.aq2_root = '/dropbox/17-18/573/AQUAINT-2'
        self.eng_gw_root = '/dropbox/17-18/573/ENG-GW'
        self.doc_reader = DocReader(self.aq_root, self.aq2_root,
                                    self.eng_gw_root)

    def test_read_docs(self):
        # Read all docs from training test set
        input_xml_filename = 'src/test_data/aquaint_topics.xml'
        parsed = self.doc_reader.read_docs(input_xml_filename)
        print(json.dumps(parsed))

    def test_resolve_path(self):
        test_data = [
            ('XIE19990529.0166', (self.aq_root + '/xie/1999/19990529_XIN_ENG',
                                  'AQUAINT')),
            ('APW19990421.0284', (self.aq_root + '/apw/1999/19990421_APW_ENG',
                                  'AQUAINT')),
            ('NYT19990421.0284', (self.aq_root + '/nyt/1999/19990421_NYT',
                                  'AQUAINT')),
            ('XIN_ENG_20050415.0040',
             (self.aq2_root + '/data/xin_eng/xin_eng_200504.xml',
              'AQUAINT-2')),
            ('APW_ENG_20061002.1245',
             (self.eng_gw_root + '/data/apw_eng/apw_eng_200610.gz', 'ENG-GW')),
            ('APW_ENG_20061003.0134',
             (self.eng_gw_root + '/data/apw_eng/apw_eng_200610.gz', 'ENG-GW')),
        ]
        for doc_id, path in test_data:
            with self.subTest(doc_id=doc_id):
                self.assertEqual(path, self.doc_reader.resolve_path(doc_id))

        # An unrecognized doc_id should raise
        with self.assertRaises(ValueError):
            self.doc_reader.resolve_path('foo')

    def test_parse_doc(self):
        test_data = [
            ('/corpora/LDC/LDC02T31/apw/1998/19980601_APW_ENG', 'AQUAINT',
             'APW19980601.0007'),
            ('/corpora/LDC/LDC08T25/data/apw_eng/apw_eng_200601.xml',
             'AQUAINT-2', 'APW_ENG_20060101.0027'),
            ('/corpora/LDC/LDC11T07/data/apw_eng/apw_eng_200610.gz', 'ENG-GW',
             'APW_ENG_20061003.0134'),
        ]
        for path, format_name, doc_id in test_data:
            content = self.doc_reader.parse_doc(path, format_name, doc_id)
            print(content)

    def test_read(self):
        #input_xml_filename = '/dropbox/17-18/573/Data/Documents/devtest/GuidedSumm10_test_topics.xml'
        test_data_dir = os.path.dirname(
            os.path.realpath(__file__)) + '/test_data'
        input_xml_filename = test_data_dir + '/test_topics.xml'
        topics_data = self.doc_reader.read_docs(input_xml_filename)
        print(topics_data)
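The cases in test_resolve_path pin down the corpus layout, so a sketch of resolve_path can be reconstructed from them. The '200603' cutoff between AQUAINT-2 and ENG-GW is an assumption inferred from the test dates; the real implementation may decide differently:

import re

def resolve_path(self, doc_id):
    # AQUAINT-2 / ENG-GW style ids, e.g. 'XIN_ENG_20050415.0040'
    m = re.match(r'^([A-Za-z]+_[A-Za-z]+)_(\d{6})\d{2}\.\d+$', doc_id)
    if m:
        src, yyyymm = m.group(1).lower(), m.group(2)
        if yyyymm <= '200603':  # assumed cutoff between the two corpora
            return ('{}/data/{}/{}_{}.xml'.format(self.aq2_root, src, src, yyyymm),
                    'AQUAINT-2')
        return ('{}/data/{}/{}_{}.gz'.format(self.eng_gw_root, src, src, yyyymm),
                'ENG-GW')
    # AQUAINT style ids, e.g. 'XIE19990529.0166'
    m = re.match(r'^([A-Z]{3})(\d{8})\.\d+$', doc_id)
    if m:
        src, date = m.groups()
        suffix = {'NYT': '_NYT', 'XIE': '_XIN_ENG'}.get(src, '_{}_ENG'.format(src))
        return ('{}/{}/{}/{}{}'.format(self.aq_root, src.lower(), date[:4], date, suffix),
                'AQUAINT')
    raise ValueError('Cannot resolve path for doc_id {!r}'.format(doc_id))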
Example #4
    generate_data(dataset_folder=dataset_folder)

    # Read corpus
    with open(data_file, 'rb') as f:
        frame = pd.read_csv(f)

    # Document retriever based on BM25
    retriever = DocRetriever(top_n=10)

    # Index the corpus for BM25 scoring
    retriever.fit_retriever(frame)

    # Reader based on the Hugging Face BertForQuestionAnswering transformer
    #reader = DocReader('./model/')
    reader = DocReader('bert-large-uncased-whole-word-masking-finetuned-squad')

    query = input("Enter the query (type exit to quit) : ")
    while query not in ('exit', 'Exit'):
        print("Processing...")

        # Find the top_n documents for the query by BM25 score
        doc_scores = retriever.compute_scores(query)

        # Select the top_n documents
        index = [score[0] for score in doc_scores]
        text = frame.loc[index]

        # Predict the n_best answers using BERT
        answers = reader.predict(df=text, query=query, n_best=5)

        # Prompt again so the loop can terminate
        query = input("Enter the query (type exit to quit) : ")
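DocRetriever is a project class; for orientation, here is a minimal sketch of the interface the loop above relies on, backed by the rank_bm25 package (the BM25 backend and the 'content' column name are assumptions):

from rank_bm25 import BM25Okapi

class DocRetriever:
    def __init__(self, top_n=10):
        self.top_n = top_n

    def fit_retriever(self, frame, text_column='content'):
        # Tokenize the corpus naively and index it for BM25 scoring.
        self.corpus = frame[text_column].astype(str).tolist()
        self.bm25 = BM25Okapi([doc.lower().split() for doc in self.corpus])

    def compute_scores(self, query):
        # Return (row_index, score) pairs for the top_n documents.
        scores = self.bm25.get_scores(query.lower().split())
        ranked = sorted(enumerate(scores), key=lambda p: p[1], reverse=True)
        return ranked[:self.top_n]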
Example #5
File: main.py  Project: austin226/ling573
    init_dirs()

    input_xml_filename = sys.argv[1]
    output_base_dir = sys.argv[2]
    core_nlp_port = int(sys.argv[3])

    print(
        "Starting summarizer with input_xml_filename='{}', output_base_dir='{}', core_nlp_port='{}'"
        .format(input_xml_filename, output_base_dir, core_nlp_port))

    os.makedirs(output_base_dir, exist_ok=True)

    # Initialize document reader with AQUAINT, AQUAINT-2, and ENG-GW root paths
    doc_reader = DocReader('/dropbox/17-18/573/AQUAINT',
                           '/dropbox/17-18/573/AQUAINT-2',
                           '/dropbox/17-18/573/ENG-GW')
    summarizer = build_summarizer(core_nlp_port)

    print('Reading in documents from "{}"...'.format(input_xml_filename))
    topics_data = doc_reader.read_docs(input_xml_filename)['topics']
    for i, topic in enumerate(topics_data):
        topic_id = topic['id']
        topic_title = topic['title']
        topic_category = topic['category']
        docset = topic['docset']

        print(
            'Summarizing topic "{}" (topic {} of {}, {} documents)...'.format(
                topic_title, i + 1, len(topics_data), len(docset)))
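With three positional arguments, an invocation would look like python main.py topics.xml outputs 9000 (the file and directory names here are placeholders), where the last argument is presumably the port of a running CoreNLP server that build_summarizer connects to.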
Example #6
 def __init__(self):
     self.name_getter = ThuItemGetter()
     self.location_getter = ThuLocationGetter()
     self.item_getter = ThuItemGetter()
     self.relation_getter = None
     self.doc_reader = DocReader()
Example #7
 def setUp(self):
     self.aq_root = '/dropbox/17-18/573/AQUAINT'
     self.aq2_root = '/dropbox/17-18/573/AQUAINT-2'
     self.eng_gw_root = '/dropbox/17-18/573/ENG-GW'
     self.doc_reader = DocReader(self.aq_root, self.aq2_root,
                                 self.eng_gw_root)
Example #8
class DocuBot():

    def __init__(self):
        # Keep all state per instance; a class-level DocReader would be
        # shared across every DocuBot.
        self.THRESH = 0.5
        self.doc = ""
        self.doc_local = ""
        self.doc_reader = DocReader()

    # SET FUNCTIONS
    def set_file_path(self, filename, option):
        if filename.lower().endswith('.docx') and os.path.isfile(filename):
            if option == 1:
                self.doc = filename
                print("File uploaded successfully")
                return True
            if option == 2:
                self.doc_local = filename
                print("File 2 uploaded successfully")
                return True
            return False
        print("Error: file type not supported. Please make sure the file is a .docx file.")
        return False

    def set_thresh(self, x):
        # Map a sensitivity level (1-3) to a similarity threshold.
        sens = [0, 0.1, 0.5, 0.70]
        if 0 < x < 4:
            self.THRESH = sens[x]

    #GET FUNCTIONS
    def get_thresh(self):
        return self.THRESH

    #RUN ANALYZE
    def merge_duplicates(self, data):
        #FIX THIS MERGE PART
        try:
            dataset = data.to_frame()
        except AttributeError:
            # data is already a DataFrame
            dataset = data
        merged_dataset = dataset.groupby(
            ['simi_links', 'simi_content'])['indiv_scores'].mean().reset_index()
        return merged_dataset

    def analyze_simi_online(self):
        if os.path.exists(SIMI_RESULT):
            os.remove(SIMI_RESULT)

        start = time.time()
        filename = self.doc
        se = SearchEngine()
        se.get_simi_link(filename)
        sp = ScrapeProcessor()
        df = sp.check_simi()

        clean_df = self.merge_duplicates(df)
        indiv_scores = clean_df['indiv_scores'].astype(str).astype(float)

        # Sum the scores of links that clear the threshold.
        total_score = 0.0
        url_list = []
        for link, score in zip(clean_df['simi_links'].astype(str), indiv_scores):
            if score > self.THRESH:
                total_score += score
                url_list.append(link)

        clean_df.to_csv(SIMI_RESULT)

        # Report timing before returning (the original printed it after
        # the return, so it never ran).
        end = time.time()
        print("Time taken to analyze: {}".format(end - start))

        try:
            output = (total_score / len(url_list)) * 100
            return output, url_list
        except ZeroDivisionError:
            # No links exceeded the threshold
            return 0.0, ["None Found"]

    def analyze_simi_local(self, d1, d2):
        return self.doc_reader.doc_similarity(d1, d2)
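A hypothetical end-to-end use of DocuBot (the file name is a placeholder, and analyze_simi_online requires the search and scraping machinery to be configured):

bot = DocuBot()
if bot.set_file_path('report.docx', option=1):
    bot.set_thresh(2)  # sensitivity level 2 -> THRESH = 0.5
    score, links = bot.analyze_simi_online()
    print('Similarity: {:.1f}%'.format(score))
    for url in links:
        print(url)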