Example #1
import json
import os
import traceback


def expand(indir, output_file):
    # URLUtility and HTMLParser are project-local helpers (not shown here).
    files = os.listdir(indir)
    # Many seed URLs come from the same site, so their outlinks can be duplicated.
    uniq_links = set()
    out = open(output_file, "w", encoding="utf-8")
    for f in files:
        if f.split(".")[-1] != "json":
            # Process only .json files.
            continue
        filename = indir + "/" + f
        with open(filename) as lines:
            for line in lines:
                try:
                    data = json.loads(line)
                    url = URLUtility.normalize(data['url'])
                    html_content = data['html']
                    #links = HTMLParser.extract_links(url, html_content)
                    links = HTMLParser.extract_links_bs(url, html_content)
                    for link in links:
                        if URLUtility.is_same_site(url, link) and link not in uniq_links:
                            uniq_links.add(link)
                            out.write(link + "\n")
                    if url not in links:
                        out.write(url + "\n")
                except Exception:
                    traceback.print_exc()
                    continue
    out.close()
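A minimal usage sketch for the function above; the module name and paths are hypothetical:

from expand_seeds import expand  # hypothetical module holding the function above

# Read seed-page JSON dumps from ./seeds and write the deduplicated
# same-site outlinks, one per line, to outlinks.txt.
expand("seeds", "outlinks.txt")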
Example #2
def test_parseHTMLParser(self):
    html = HTMLParser()
    html.parse('files/Test.html')
    text = [
        'page', 'margin', '2cm', 'p', 'margin', '0', '25cm', 'direct',
        'ltr', 'color', '00000a', 'line', 'height', '115', 'text', 'align',
        'left', 'orphan', '2', 'widow', '2', 'p', 'western', 'font',
        'famili', 'liber', 'serif', 'serif', 'font', 'size', '12pt',
        'languag', 'ru', 'ru', 'p', 'cjk', 'font', 'famili', 'noto', 'san',
        'cjk', 'sc', 'regular', 'font', 'size', '12pt', 'languag', 'zh',
        'cn', 'p', 'ctl', 'font', 'famili', 'lohit', 'devanagari', 'font',
        'size', '12pt', 'languag', 'hi', 'in', 'link', 'languag', 'zxx',
        'i', 'test', 'poop', 'test', 'anim', 'test', 'anim', 'googl',
        'link'
    ]
    assert html.get_processed_stems() == text
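The expected stems above ('famili', 'languag', 'anim', 'googl') look like Porter-stemmer output. The project's stemmer is not shown here, so the snippet below is only an illustration using NLTK's PorterStemmer:

from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
# Porter stemming truncates words the same way as the expected list above:
# family -> famili, language -> languag, animal -> anim, google -> googl
print([stemmer.stem(w) for w in ['family', 'language', 'animal', 'google']])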
Example #3
def expand(indir, output_file):
    files = os.listdir(indir)
    # Many seed URLs come from the same site, so their outlinks can be duplicated.
    uniq_links = set()
    out = open(output_file, "w", encoding="utf-8")
    for f in files:
        filename = indir + "/" + f
        with open(filename) as lines:
            for line in lines:
                data = json.loads(line)
                url = URLUtility.normalize(data['url'])
                html_content = data['text']
                #links = HTMLParser.extract_links(url, html_content)
                links = HTMLParser.extract_links_bs(url, html_content)
                for link in links:
                    if URLUtility.is_same_site(url, link) and link not in uniq_links:
                        uniq_links.add(link)
                        out.write(link + "\n")
                if url not in links:
                    out.write(url + "\n")

    out.close()
Example #4
def test_get_linksHTMLParser(self):
    html = HTMLParser()
    html.parse('files/Test.html')
    text = [('google\nlink', 'http://google.com/'),
            ('google\nlink', 'http://google.com/')]
    assert html.get_links() == text
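For comparison, a minimal sketch that collects the same (anchor text, href) pairs with the standard library's html.parser; this is not the project's HTMLParser class, and all names below are illustrative:

from html.parser import HTMLParser as StdHTMLParser

class LinkCollector(StdHTMLParser):
    def __init__(self):
        super().__init__()
        self.links = []    # collected (text, href) pairs
        self._href = None  # href of the <a> tag currently open, if any
        self._text = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            self._href = dict(attrs).get('href')
            self._text = []

    def handle_data(self, data):
        if self._href is not None and data.strip():
            self._text.append(data.strip())

    def handle_endtag(self, tag):
        if tag == 'a' and self._href is not None:
            self.links.append(('\n'.join(self._text), self._href))
            self._href = None

c = LinkCollector()
c.feed('<a href="http://google.com/">google<br>link</a>')
print(c.links)  # [('google\nlink', 'http://google.com/')]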
Example #5
class Controller:
    directory = ""
    manager = DBManager()
    parser = HTMLParser()
    normalizer = Normalizer()

    def __init__(self):
        print("Controller init")
        self.directory = "docrepository"

    def main(self):
        print('Options: 1-setup 2-run 3-exit')
        while True:
            text = input("> ")
            if (text == '1'):
                self.setup()
            elif (text == '2'):
                self.displayResults()
            elif (text == '3'):
                break
            else:
                print("Invalid input")

    def setup(self):
        print("Found the following files:")
        p = Path(self.directory)
        for i in p.iterdir():
            print("Working on file:", i.name)
            path = Path.cwd().joinpath(self.directory + "/" + i.name)
            with path.open('r') as file:
                # Parser
                filetext = self.parser.parse(file)
                # Normalizer
                normalized = self.normalizer.normalize(filetext)
                # Save to DB
                if self.manager.saveDoc(i.name) == 1:
                    for term in normalized:
                        if self.manager.saveTerm(term) == 1:
                            relation = {'doc': i.name, 'term': term}
                            self.manager.saveRelation(relation,
                                                      normalized[term])
        self.manager.updateIDF()

    def computeTable(self, queryArray, result, table, method):
        # List the corpus files once; each query's score column follows this order.
        table['Files'] = sorted(Path(self.directory).iterdir())
        count = 1
        for query in queryArray:
            index = 'Q' + str(count)
            table[index] = []
            aux = result[query]
            for r in aux:
                if method == 1:
                    # Retrieve the TF dot-product score
                    table[index].append(r['scalarTF'])
                elif method == 2:
                    # Retrieve the TF-IDF dot-product score
                    table[index].append(r['scalarTF_IDF'])
                elif method == 3:
                    # Retrieve the TF cosine score
                    table[index].append(r['cosTF'])
                elif method == 4:
                    # Retrieve the TF-IDF cosine score
                    table[index].append(r['cosTF_IDF'])
            count = count + 1
        return table

    def displayResults(self):
        # Use a context manager so the query file is closed after reading.
        with open('queryfile.txt', 'r') as queryfile:
            queryArray = queryfile.read().splitlines()
        table = OrderedDict()
        table['Files'] = []

        result = OrderedDict()

        # Compute all calculations
        for query in queryArray:
            normalized = self.normalizer.normalize(query)
            result[query] = sorted(search.calcAll(normalized,
                                                  self.manager.docs,
                                                  self.manager.relations,
                                                  self.manager.terms),
                                   key=itemgetter('doc'))

        print("RELEVANCIA: ProductoEscalarTF")
        table = self.computeTable(queryArray, result, table, 1)
        print(tabulate(table, headers="keys"))
        print()

        print("RELEVANCIA: ProductoEscalarTFIDF")
        table = self.computeTable(queryArray, result, table, 2)
        print(tabulate(table, headers="keys"))
        print()

        print("RELEVANCIA: CosenoTF")
        table = self.computeTable(queryArray, result, table, 3)
        print(tabulate(table, headers="keys"))
        print()

        print("RELEVANCIA: CosenoTFIDF")
        table = self.computeTable(queryArray, result, table, 4)
        print(tabulate(table, headers="keys"))
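For reference, the four relevance scores selected in computeTable correspond to standard dot-product ("producto escalar") and cosine measures over TF or TF-IDF weights. A minimal sketch of the two measures over sparse term-weight dictionaries (illustrative only, not the project's search module):

import math

def dot(q, d):
    # Dot product of two sparse weight vectors stored as {term: weight}.
    return sum(w * d.get(t, 0.0) for t, w in q.items())

def cosine(q, d):
    # Dot product normalized by the two vector magnitudes.
    nq = math.sqrt(sum(w * w for w in q.values()))
    nd = math.sqrt(sum(w * w for w in d.values()))
    return dot(q, d) / (nq * nd) if nq and nd else 0.0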
Example #6
def html(link):
    # Parse the page at `link` and return its list of processed word stems.
    html_parser = HTMLParser()
    html_parser.parse(link)
    word_list = html_parser.get_processed_stems()
    return word_list
Example #7
class Controller:
    directory = ""
    manager = DBManager()
    parser = HTMLParser()
    normalizer = Normalizer()
    searcher = Search()
    metrics = Metrics()

    def __init__(self):
        print("Controller init")
        self.directory = "docrepository"

    def main(self):
        print('Options: 1-setup 2-run 3-run_debug_mode 4-exit d-debug')
        while True:
            text = input("> ")
            if (text == '1'):
                self.setup()
            elif (text == '2'):
                self.displayResults(False)
            elif (text == '3'):
                self.displayResults(True)
            elif (text == '4'):
                break
            elif (text == 'd'):
                self.setupDebug()
            else:
                print("Invalid input")

    def setup(self):
        print("Set up init: ", datetime.datetime.now())
        print("Found the following files:")
        p = Path(self.directory)
        for i in p.iterdir():
            print("Working on file:", i.name)
            if i.name != ".gitkeep":
                path = Path.cwd().joinpath(self.directory + "/" + i.name)
                with path.open('r', encoding='ascii',
                               errors='replace') as file:
                    # Parser
                    filetext = self.parser.parse(file)

                    # Normalizer

                    # Abort if normalizing a single file takes longer than 120 s.
                    def timeout_handler(num, stack):
                        raise Exception("File timeout")

                    signal.signal(signal.SIGALRM, timeout_handler)
                    signal.alarm(120)
                    try:
                        normalized = self.normalizer.normalize(filetext)
                        # Save to DB
                        if self.manager.saveDoc(i.name) == 1:
                            for term in normalized:
                                if self.manager.saveTerm(term) == 1:
                                    relation = {'doc': i.name, 'term': term}
                                    self.manager.saveRelation(
                                        relation, normalized[term])
                        self.manager.terms.reindex()
                        self.manager.docs.reindex()
                        self.manager.relations.reindex()
                    except Exception:
                        print("Unable to parse file:", i.name)
                    finally:
                        signal.alarm(0)
        self.manager.updateIDF()
        print("Set up end: ", datetime.datetime.now())

    def setupDebug(self):
        print("Found the following files:")
        p = Path(self.directory)
        number = 0
        for i in p.iterdir():
            if i.name != ".gitkeep":
                print("[" + str(number) + "] Working on file:", i.name)
                number += 1
                path = Path.cwd().joinpath(self.directory + "/" + i.name)
                try:
                    with path.open('r', encoding='ASCII',
                                   errors='replace') as file:
                        # Parser
                        start_time = time.time()
                        filetext = self.parser.parse(file)
                        print("--- %s seconds in parsing ---" %
                              (time.time() - start_time))
                        # Normalizer
                        start_time = time.time()
                        normalized = self.normalizer.normalize(filetext)
                        print("--- %s seconds in normalize ---" %
                              (time.time() - start_time))
                        # Save to DB
                        start_time = time.time()
                        if self.manager.saveDoc(i.name) == 1:
                            for term in normalized:
                                if self.manager.saveTerm(term) == 1:
                                    relation = {'doc': i.name, 'term': term}
                                    self.manager.saveRelation(
                                        relation, normalized[term])
                        self.manager.terms.reindex()
                        self.manager.docs.reindex()
                        self.manager.relations.reindex()
                        print("--- %s seconds in saving ---" %
                              (time.time() - start_time))
                        print(str(len(normalized)) + ' terms saved.')
                        print(
                            str(self.manager.terms.count()) + ' total terms.')
                except UnicodeDecodeError:
                    print('The file is not encoded in ASCII.')
        self.manager.updateIDF()

    def computeTable(self, topicArray, result, table, debug, threshold, limit):
        count = 0
        for topic in topicArray:
            index = topic['id']
            if debug:
                count = count + 1
                if count == limit:
                    break
            table[index] = []
            aux = result[topic['id']]
            for r in aux:
                # Retrieve the TF-IDF cosine score
                if r['cosTF_IDF'] and r['cosTF_IDF'] >= threshold:
                    table[index].append(r['doc'].split('.')[0])
        return table

    def displayResults(self, debug):
        limit = 5
        topicArray = []
        if debug:
            print("Running...")
        root = xml.etree.ElementTree.parse('2010-topics.xml').getroot()
        for topic in root.findall('topic'):
            topicArray.append({
                'id': topic.get('id'),
                'query': topic.find('title').text
            })
        table = OrderedDict()

        result = OrderedDict()

        # Compute all calculations
        count = 0
        for topic in topicArray:
            if debug:
                print(topic['query'])
                count = count + 1
                if count == limit:
                    break
            normalized = self.normalizer.normalize(topic['query'])
            queryArray = []

            for term in normalized:
                if debug:
                    print("Synonyms for term: ", term)
                ss = wordnet.synsets(term)
                if not ss:
                    queryArray.append(term)
                else:
                    if debug:
                        print(ss[0].lemma_names())
                    for x in ss[0].lemma_names():
                        # Add synonyms of the first sense to the expanded query
                        queryArray.append(x)
            if debug:
                print("Extended Query>>>>>", queryArray)

            result[topic['id']] = sorted(self.searcher.calcAll(
                queryArray, self.manager.docs, self.manager.relations,
                self.manager.terms),
                                         key=itemgetter('doc'))

        threshold = 0.09
        table['Metricas'] = [
            'numDocs', 'recall10', 'recall5', 'precision10', 'precision5',
            'fvalue10', 'fvalue5', 'rrank1', 'rrank2', 'aprecision', 'nDCG10'
        ]
        table = self.computeTable(topicArray, result, table, debug, threshold,
                                  limit)

        count = 0
        for topic in topicArray:
            if debug:
                count = count + 1
                if count == limit:
                    break
            files = table[topic['id']]
            print(topic['id'])

            table[topic['id']] = []

            table[topic['id']].append(len(files))

            recall = self.metrics.cutRecall(sorted(files), topic['id'])
            table[topic['id']].append(recall['recall10'])
            table[topic['id']].append(recall['recall5'])

            # tests the cutPrecision
            precision = self.metrics.cutPrecision(sorted(files), topic['id'])
            table[topic['id']].append(precision['precision10'])
            table[topic['id']].append(precision['precision5'])

            # tests the FMeasure
            FMeasure = self.metrics.FMeasure(precision, recall)
            table[topic['id']].append(FMeasure['fvalue10'])
            table[topic['id']].append(FMeasure['fvalue5'])

            # tests the RRank1
            RRank1 = self.metrics.RRank1(files, topic['id'])
            table[topic['id']].append(RRank1['rrank1'])

            # tests the RRank2
            RRank2 = self.metrics.RRank2(files, topic['id'])
            table[topic['id']].append(RRank2['rrank2'])

            # tests the Aprecision
            Aprecision = self.metrics.APrecision(files, topic['id'])
            table[topic['id']].append(Aprecision['aprecision'])

            # tests the nDCG
            nDCG = self.metrics.nDCG(files, topic['id'], 10)
            table[topic['id']].append(nDCG['nDCG10'])
        '''
        table['Metricas'] = ['numDocs','recall10', 'recall5','precision10', 'precision5','fvalue10','fvalue5', 'rrank1', 'rrank2', 'aprecision', 'nDCG10']
        table['2010-001'] = [20, 0.8, 0.9,0.5833, 0.3333, 0.6363, 0.470588, 0.0769, 0.1111, 1.125, [0.5, 0.6666, 0.75, 0.6478, 0.6821, 0.6293, 0.6988, 0.69887, 0.81874]]
        table['2010-002'] = [10, 0.8, 0.6,0.5833, 0.3333, 0.6363, 0.470588, 0.0769, 0.1111, 1.125, [0.5, 0.6666, 0.75, 0.6478, 0.6821, 0.6293, 0.6988, 0.69887, 0.81874]]
        table['2010-003'] = [15, 0.8, 0.3,0.5833, 0.3333, 0.6363, 0.470588, 0.0769, 0.1111, 1.125, [0.5, 0.6666, 0.75, 0.6478, 0.6821, 0.6293, 0.6988, 0.69887, 0.81874]]
        #table['2010-004'] = [20, 0.8, 0.7,0.5833, 0.3333, 0.6363, 0.470588, 0.0769, 0.1111, 1.125, [0.5, 0.6666, 0.75, 0.6478, 0.6821, 0.6293, 0.6988, 0.69887, 0.81874]]
        #table['2010-005'] = [20, 0.8, 0.7,0.5833, 0.3333, 0.6363, 0.470588, 0.0769, 0.1111, 1.125, [0.5, 0.6666, 0.75, 0.6478, 0.6821, 0.6293, 0.6988, 0.69887, 0.81874]]
        '''
        # Compute the averages (nDCG10 is a list of values and is not averaged here)
        avgDocs, avgR10, avgR5, avgP10, avgP5, avgF10, avgF5, avgRR1, avgRR2, avgAP = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        for topic in table:
            if topic != 'Metricas':
                avgDocs += table[topic][0]
                avgR10 += table[topic][1]
                avgR5 += table[topic][2]
                avgP10 += table[topic][3]
                avgP5 += table[topic][4]
                avgF10 += table[topic][5]
                avgF5 += table[topic][6]
                avgRR1 += table[topic][7]
                avgRR2 += table[topic][8]
                avgAP += table[topic][9]

        size = len(table) - 1
        table['Medias'] = []
        table['Medias'].append(avgDocs / size)
        table['Medias'].append(avgR10 / size)
        table['Medias'].append(avgR5 / size)
        table['Medias'].append(avgP10 / size)
        table['Medias'].append(avgP5 / size)
        table['Medias'].append(avgF10 / size)
        table['Medias'].append(avgF5 / size)
        table['Medias'].append(avgRR1 / size)
        table['Medias'].append(avgRR2 / size)
        table['Medias'].append(avgAP / size)
        print(tabulate(table, headers="keys"))
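For context on the metric columns above, a minimal sketch of cut-off recall, precision, and F-value; the project's Metrics class is not shown, so these are generic textbook definitions:

def recall_at(retrieved, relevant, k):
    # Fraction of all relevant documents that appear in the top k results.
    return sum(1 for d in retrieved[:k] if d in relevant) / len(relevant) if relevant else 0.0

def precision_at(retrieved, relevant, k):
    # Fraction of the top-k retrieved documents that are relevant.
    return sum(1 for d in retrieved[:k] if d in relevant) / k if k else 0.0

def f_value(p, r):
    # Harmonic mean of precision and recall.
    return 2 * p * r / (p + r) if (p + r) else 0.0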
Example #8
from htmlparser import HTMLParser

offset = 0       # first record of the current batch
increment = 100  # batch size
total = 232596   # total number of records in crawler.db
parsing = True

while parsing:
    # Clamp the final offset so the last partial batch is still requested.
    if offset > total:
        offset = total
    print('Parsing records ' + str(offset) + ' to ' + str(offset + increment))
    toparse = HTMLParser('crawler.db', offset, increment)
    toparse.parse_data()
    if offset == total:
        parsing = False
    offset = offset + increment

#toparse.len_of_data()
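The loop above walks the table in fixed-size batches, clamping the final offset so the tail of the data is still requested. The same pattern in a reusable form (a sketch; the HTMLParser constructor is assumed to take a batch size, as above):

def batched_offsets(total, increment):
    # Yield (offset, size) pairs covering records 0..total-1 in
    # fixed-size batches, including the final partial batch.
    offset = 0
    while offset < total:
        yield offset, min(increment, total - offset)
        offset += increment

# e.g.:
# for offset, size in batched_offsets(232596, 100):
#     HTMLParser('crawler.db', offset, size).parse_data()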