Example #1
 def __init__(self):
     self.parameter = list()
     self.total = list()
     self.paramCount = 0
     self.type = 'string'
     self.actionIndexer = Indexer(self.type)
     self.featureIndexer = Indexer(self.type)
Example #2
    def processFile(self):
        interpreter = Interpreter()

        # return all the documents present in the file
        output = self.path + '.bin'
        if isfile(output):
            print('loading tokens')
            self.index = pickle.load(open(output, 'rb'))
            self.indexer = Indexer(self.tokenizer, index=self.index)
        else:
            self.indexer = Indexer(self.tokenizer)
            file = open(self.path, 'r', encoding='utf-8', errors='ignore')
            maximum = os.stat(self.path).st_size
            # initialize the variables
            i = 0
            progress = 0
            document = []
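            # a blank line marks the end of a document; hand the buffered lines to the interpreter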
            for line in file:
                progress += len(line)
                if line == '\n':
                    interpreter.process(self.indexer, document)
                    document = []
                else:
                    document += [line]
                i += 1
                if i >= 5000:
                    i = 0
                    log(progress, maximum)

            file.close()
            self.index = self.indexer.index
            print('\nsaving tokens')
            pickle.dump(self.index, open(output, 'wb'))
Example #3
def start_indexing(dirs_list, dirs_dicts, main_path, posting_path, to_stem,
                   start_index, end_index, directory):
    dirs_dicts[directory] = None
    reader = ReadFile()
    parser = Parse(main_path)
    indexer = Indexer(posting_path + directory)

    if to_stem:
        parser.to_stem = True
        indexer.to_stem = True
    if not os.path.exists(posting_path + directory):
        os.makedirs(posting_path + directory)

    documents = {}
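    # walk the assigned slice of files: parse every document and feed its terms to the indexer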
    i = start_index
    while i < end_index:
        docs = reader.separate_docs_in_file(main_path + '\\corpus',
                                            dirs_list[i])
        j = 0
        for doc_id in docs:
            doc_dict = parser.main_parser(docs[doc_id].text, docs[doc_id])
            docs[doc_id].text = None
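            # flag the indexer once the last document of the last file has been parsed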
            if i == end_index - 1 and j == len(docs) - 1:
                indexer.finished_parse = True
            indexer.index_terms(doc_dict, doc_id)
            documents[doc_id] = docs[doc_id]
            j += 1
        i += 1
    dirs_dicts[directory] = [
        indexer.post_files_lines, indexer.terms_dict, documents,
        reader.languages
    ]
Example #4
 def __init__(self, device):
     super(PostagEmbedding, self).__init__(device=device)
     self.indexer = Indexer(
         special_tokens={
             '<s>': 0,
             '<unk>': 1,
             '<pad>': 2,
             '<\s>': 3,
             '<mask>': 4
         },
     with_del_stopwords=False)  # always False only for the postag embedding
     datasets = Dataset().get_instance()
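     # POS-tag the tokenized training sentences and keep only the tag sequences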
     sentences = [
         nltk.pos_tag(self.indexer.tokenize(pairs[0]))
         for pairs in datasets['train']
     ]
     sentences = [[pairs[1] for pairs in sentence]
                  for sentence in sentences]
     for sentence in sentences:
         self.indexer.add_sentence(sentence, with_raw=True)
     self.embedding_dim = 10
     self.embedding = nn.Embedding(num_embeddings=len(self.indexer),
                                   embedding_dim=self.embedding_dim,
                                   padding_idx=self.indexer.padding_index)
     self.embedding.to(device)
Example #5
def indexDirs():
    click.echo("Indexing Files....")
    libraries = jhandler.getLibs()
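    # index every configured library and report how many items were found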
    indxer = Indexer(libraries)
    nols = indxer.index()
    click.echo("{} library items detected!".format(nols))
    click.echo("Done Indexing Files....")
Example #6
def main():
    parser = util.get_dates_range_parser()
    args = parser.parse_args()

    ind = Indexer(DB_DIR)

    #prof1, nouns1 = get_profiles(ind, args.start)
    #prof2, nouns2 = get_profiles(ind, args.end)
   
    cur = ind.get_db_for_date(args.start)

    prof, nouns = get_profiles(ind, args.start)
    
    replys_v = set()
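    # collect the union of all reply targets seen across the profiles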
    for p in prof:
        replys_v |= set(prof[p].replys.keys())

    m = []
    for p in prof:
        m_i = []
        for r in replys_v:
            if r in prof[p].replys:
                m_i.append(prof[p].replys[r])
            else:
                m_i.append(0)
        m.append(m_i)

    logging.info("%s x %s" % (len(m), len(m[0])))

    u, s, v = numpy.linalg.svd(m, full_matrices=False)

    k = 50

    uk = numpy.transpose(numpy.transpose(u)[:k])
    sk = s[:k]

    stats.create_given_tables(cur, ["noun_similarity"])
    cur.execute("create table if not exists noun_sim_svd as select * from noun_similarity limit 0")

    p_keys = prof.keys()

    sims = []
    for i in range(0, len(p_keys)):
        for j in range(i + 1, len(p_keys)):
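            # cosine similarity between the two profiles in the reduced k-dimensional space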
            
            p1_ = map(lambda x: u[i][x] * sk[x] , range(0, k))
            p2_ = map(lambda x: u[j][x] * sk[x] , range(0, k))

            sim = numpy.dot(p1_, p2_) / (numpy.linalg.norm(p1_) * numpy.linalg.norm(p2_))
            sims.append((p_keys[i], p_keys[j], sim))
 
            if len(sims) > 20000:
                save_sims(cur, sims)
                sims = []

                logging.info("Another 10k seen")

    save_sims(cur, sims)

    logging.info("done")
Example #7
def interact(c):
    # Create an Indexer object
    indexer = Indexer(root)
    while True:
        #  Get Directory details
        curr_directory, folders, files = indexer.get_dir_details()
        print "Current Directory :", curr_directory
        #  Send the details to the client

        c.send(curr_directory + "\n")
        for item in folders:
            c.send(item + '\n')
        c.send("\n")
        for item in files:
            c.send(item[0] + '\n' + str(item[1]) + "\n")
        c.send("/")
        # Receive response from client
        choice = c.recv(1024)

        # A choice of -1 means the client is disconnecting
        if choice == "-1":
            print "Disconnecting from Client"
            return
        elif int(choice) <= len(folders):
            # Change directory if a folder is selected
            indexer.make_choice(int(choice))
        else:
            # Send the file if a file is selected
            send_file(c, indexer.get_file_path(int(choice)))
    return
Example #8
def main():
    # Indexer Initialization
    indexer = Indexer()
    indexer.build_dictionary()
    # indexer.write_dict_to_file()

    # classify files
    run_classifier(indexer)
Example #9
 def __init__(self):
     self._max_url_length = 100
     self._url_list = []
     self._title_list = []
     self._max_stay_on_site = 100
     self._current_on_site = 0
     self._previous_domain = None
     self._max_urls_in_list = 500
     self._max_new_urls_per_page = 100
     self._aggressive_pruning = True
     self._indexer = Indexer("localhost", 9200)
Example #10
def main():
    parser = util.get_dates_range_parser()
    args = parser.parse_args()

    logging.info("Start")
    ind = Indexer(DB_DIR)

    grades = (1, 10, 100, 1000)
    data = [["date", "nouns", "tweets", "tweet_chains"] +
            map(lambda x: "cnt > %s" % x, grades)]
    print data

    dates = []
    for date in sorted(ind.dates_dbs.keys()):
        if args.start is not None and date < args.start:
            continue
        if args.end is not None and date > args.end:
            continue
        cur = ind.get_db_for_date(date)

        tables = cur.execute(
            "SELECT name FROM sqlite_master WHERE type='table' and name = 'tweets_nouns'"
        ).fetchall()
        if len(tables) == 0:
            logging.error("No tweets_nouns for date %s" % date)
            continue

        stats.create_given_tables(cur, ["post_cnt"])
        post_cnt = cur.execute("select count(*) from post_cnt").fetchone()[0]
        if post_cnt == 0:
            cur.execute(
                "insert or ignore into post_cnt select noun_md5, count(*) from tweets_nouns group by noun_md5"
            )

        cnt = [date]
        nouns_cnt = cur.execute("select count(*) from nouns").fetchone()[0]
        cnt.append(nouns_cnt)
        tweets = cur.execute("select count(*) from tweets").fetchone()[0]
        cnt.append(tweets if tweets is not None else "~")
        tweet_chains = cur.execute(
            "select count(*) from tweet_chains").fetchone()[0]
        cnt.append(tweet_chains if tweet_chains is not None else "~")

        for i in grades:
            cnti = cur.execute(
                "select count(*) from (select 1 from post_cnt where post_cnt > %s group by post_md5)"
                % i).fetchone()[0]
            cnt.append("%.2f" % ((cnti + 0.0) / nouns_cnt))

        data.append(cnt)

    for row in data:
        print_cols(row)
Example #11
 def load(self):
     self.indexer = Indexer(self.posting_path)
     if self.to_stem:
         self.indexer.to_stem = True
     self.languages = self.indexer.load()
     self.avg_doc_length = self.indexer.docs_avg_length
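     # the Searcher reuses the dictionaries the Indexer just loaded from the posting files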
     self.searcher = Searcher(self.main_path, self.posting_path,
                              self.indexer.terms_dict,
                              self.indexer.cities_dict,
                              self.indexer.docs_dict, self.avg_doc_length,
                              self.to_stem, self.with_semantics)
     self.searcher.model = Word2Vec.load(self.posting_path + '//model.bin')
Example #12
    def __init__(self, numOfLayer):
        self.num = numOfLayer
        self.parent = []
        self.children = []
        self.handled = []
        self.Indexer = Indexer()
        self.Processor = Processor()
        self.Porter = PorterStemmer()
        self.db = []

        link = "http://www.cse.ust.hk/"
        self.parent.append(link)
Example #13
    def start(self):
        self.indexer = Indexer(self.posting_path)
        if self.to_stem:
            self.indexer.to_stem = True
        dirs_list = os.listdir(self.main_path + '\\corpus')
        # Create temp postings Multiprocessing
        dirs_dict = ParallelMain.start(self.main_path, self.posting_path,
                                       self.to_stem, dirs_list)
        # Merging dictionaries that were created by the processes
        docs = {}
        files_names = []
        post_files_lines = []
        total_length = 0
        for dir in dirs_dict.keys():
            tmp_docs_dict = dirs_dict[dir][2]
            for doc_id in tmp_docs_dict:
                docs[doc_id] = tmp_docs_dict[doc_id]
                total_length += docs[doc_id].length
            for lang in dirs_dict[dir][3]:
                self.languages.add(lang)
            old_post_files_lines = dirs_dict[dir][0]
            for i in range(0, len(old_post_files_lines)):
                files_names.append((dir + "\\Posting" + str(i))
                                   if not self.to_stem else
                                   (dir + "\\sPosting" + str(i)))
                post_files_lines.append(old_post_files_lines[i])

        self.avg_doc_length = total_length / len(docs)

        # Gets Cities that appear in the corpus
        i = 0
        while i < len(dirs_list):
            self.reader.read_cities(self.main_path + '\\corpus', dirs_list[i])
            i += 1

        terms_dicts = [
            dirs_dict["\\Postings1"][1], dirs_dict["\\Postings2"][1],
            dirs_dict["\\Postings3"][1], dirs_dict["\\Postings4"][1]
        ]

        terms_dict = Merge.start_merge(files_names, post_files_lines,
                                       terms_dicts, self.posting_path,
                                       self.to_stem)

        self.indexer.docs_avg_length = self.avg_doc_length
        self.indexer.terms_dict = terms_dict
        self.indexer.docs_dict = docs
        self.indexer.index_cities(self.reader.cities)
        self.indexer.post_pointers(self.languages)
Example #14
    def __init__(self):

        DBCrawl.connect()
        DBUnCrawl.connect()
        DBRobot.connect()
        DBWebPage.connect()
        DBPageRank.connect()
        DBIndexer.connect()
        indexedCount.connect()
        #DBQuery.connect()

        self._getDBTables()
        self.indexer = Indexer()
        self.numberOfThreads = 1
        self._setNumOfThreads()
        self.crawlerObjs = []
        self._createCrawlerObjects()
Example #15
 def __init__(self, device):
     super(AbsolutePositionalEmbedding, self).__init__(device=device)
     self.max_length = 150
     self.indexer = Indexer(special_tokens={
         '<s>': 0,
         '<unk>': 1,
         '<pad>': 2,
         '<\s>': 3,
         '<mask>': 4
     },
                            with_del_stopwords=self.with_del_stopwords)
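     # register the position ids 0..max_length-1 as vocabulary entries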
     self.indexer.add_sentence(list(map(str, range(self.max_length))),
                               with_raw=True)
     self.embedding_dim = 20
     self.embedding = nn.Embedding(num_embeddings=len(self.indexer),
                                   embedding_dim=self.embedding_dim,
                                   padding_idx=self.indexer.padding_index)
     self.embedding.to(device)
Example #16
 def __init__(self, device):
     super(NtuaTwitterEmbedding, self).__init__(device=device)
     self.path = Path('../data/models/ntua-slp-semeval2018/ntua_twitter_300.txt')
     with self.path.open('r', encoding='utf-8-sig') as f:
         texts = f.readlines()
     headers = texts[0].strip().split(' ')
     contents = [text.strip().split(' ') for text in texts[1:]]
     vocab = [content[0] for content in contents]
     weights = [list(map(float, content[1:])) for content in contents]
     self.indexer = Indexer(special_tokens={'<s>': 0, '<unk>': 1, '<pad>': 2, '<\s>': 3, '<mask>': 4}, with_del_stopwords=self.with_del_stopwords)
     for word in vocab:
         self.indexer.count_word(word)
         self.indexer.add_word(word)
     self.embedding_dim = int(headers[1])
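     # zero vectors reserved for the five special tokens prepended to the vocabulary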
     special_weights = [[0.0] * self.embedding_dim] * 5
     weights = torch.FloatTensor(special_weights + weights)
     self.embedding = nn.Embedding.from_pretrained(embeddings=weights, padding_idx=self.indexer.padding_index)
     self.embedding.to(device)
Example #17
def main(argv):
    collectionFile = ''
    tokenizerType = ''
    try:
        opts, args = getopt.getopt(argv, "hf:t:",
                                   ["collectionFile=", "tokenizerType="])
    except getopt.GetoptError:
        print(
            'main.py -f <collectionFile> -t <tokenizerType: 0 - Simple, 1 - Better>'
        )
        sys.exit()

    if len(opts) != 2:
        print(
            'main.py -f <collectionFile> -t <tokenizerType: 0 - Simple, 1 - Better>'
        )
        sys.exit()

    for opt, arg in opts:
        if opt == '-h':
            print(
                'main.py -f <collectionFile> -t <tokenizerType: 0 - Simple, 1 - Better>'
            )
            sys.exit()
        elif opt in ("-f", "--collectionFile"):
            if not path.exists(arg):
                print('Incorrect path to collection file.')
                sys.exit()
            elif not path.isfile(arg):
                print('The collection path is not a file.')
                sys.exit()
            collectionFile = arg
        elif opt in ("-t", "--tokenizerType"):
            if arg != '0' and arg != '1':
                print(
                    'Incorrect tokenizer type. Simple tokenizer: 0, Better tokenizer: 1.'
                )
                sys.exit()
            tokenizerType = arg

    indexer = Indexer(collectionFile, tokenizerType)
    indexer.listTermsInOneDoc()
    indexer.listHighestDocFreqTerms()
Example #18
 def __init__(self, device):
     super(RawEmbedding, self).__init__(device=device)
     self.indexer = Indexer(special_tokens={
         '<s>': 0,
         '<unk>': 1,
         '<pad>': 2,
         '<\s>': 3,
         '<mask>': 4
     },
                            with_del_stopwords=self.with_del_stopwords)
     datasets = Dataset().get_instance()
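     # build the vocabulary from the raw training sentences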
     sentences = [pairs[0] for pairs in datasets['train']]
     self.indexer.count_word_in_text(sentences)
     self.indexer.add_sentences(sentences)
     self.embedding_dim = 100
     self.embedding = nn.Embedding(num_embeddings=len(self.indexer),
                                   embedding_dim=self.embedding_dim,
                                   padding_idx=self.indexer.padding_index)
     self.embedding.to(device)
Example #19
def probe_vocabs():
    datasets, tags = get_datasets()
    indexer = Indexer(with_preprocess=False)

    n_grams = [1, 2, 3]
    raw_texts = datasets
    multi_stats = {i: {
        'vocabs': {tag: set() for tag in tags},
        'counts': {tag: {} for tag in tags},
        'vocabs_by_labels': {tag: {'INFORMATIVE': set(), 'UNINFORMATIVE': set()} for tag in tags},
        'counts_by_labels': {tag: {'INFORMATIVE': {}, 'UNINFORMATIVE': {}} for tag in tags},
        'ann_texts': {tag: [] for tag in tags},
        'del_texts': {tag: [] for tag in tags}
    } for i in n_grams}

    del_items = set(
        ['<hashtag>', '</hashtag>', '<allcaps>', '</allcaps>', '<user>', 'covid19', 'coronavirus', 'covid',
         '<number>', 'httpurl', 19, '19'])
    del_items |= set(["'", '"', ':', ';', '.', ',', '-', '!', '?', "'s", "<", ">", "(", ")", "/"])
    del_items |= set(nltk_stopwords.words('english'))
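    # tally n-gram vocabularies and counts, overall and per label, for every dataset split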
    for n_gram in n_grams:
        for tag in tags:
            for text, label in datasets[tag]:
                words = indexer.text_processor.pre_process_doc(text)
                label = get_label_text(label)
                multi_stats[n_gram]['ann_texts'][tag].extend([['_'.join(words[i: i+n_gram]) for i in range(0, len(words) - n_gram + 1)]])
                del_words = [word for word in words if word not in del_items]
                multi_stats[n_gram]['del_texts'][tag].extend([['_'.join(del_words[i: i+n_gram]) for i in range(0, len(del_words) - n_gram + 1)]])
                if n_gram != 0:
                    words = del_words
                for word in ['_'.join(words[i: i+n_gram]) for i in range(0, len(words) - n_gram + 1)]:
                    multi_stats[n_gram]['vocabs'][tag].add(word)
                    multi_stats[n_gram]['vocabs_by_labels'][tag][label].add(word)
                    if word in multi_stats[n_gram]['counts'][tag].keys():
                        multi_stats[n_gram]['counts'][tag][word] += 1
                    else:
                        multi_stats[n_gram]['counts'][tag][word] = 1
                    if word in multi_stats[n_gram]['counts_by_labels'][tag][label].keys():
                        multi_stats[n_gram]['counts_by_labels'][tag][label][word] += 1
                    else:
                        multi_stats[n_gram]['counts_by_labels'][tag][label][word] = 1
    return {'multi_stats': multi_stats, 'raw_texts': raw_texts}
Example #20
def probe_sentence_length():
    datasets, tags = get_datasets()
    counts = {tag: {} for tag in tags}
    counts_by_labels = {tag: {'INFORMATIVE': {}, 'UNINFORMATIVE': {}} for tag in tags}
    indexer = Indexer()
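    # bucket tweets by token count, overall and broken down by label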
    for tag in tags:
        for text, label in datasets[tag]:
            words = indexer.text_processor.pre_process_doc(text)
            label = get_label_text(label)
            if len(words) in counts[tag].keys():
                counts[tag][len(words)].append(words)
            else:
                counts[tag][len(words)] = [words]

            if len(words) in counts_by_labels[tag][label].keys():
                counts_by_labels[tag][label][len(words)] += 1
            else:
                counts_by_labels[tag][label][len(words)] = 1

    return {'counts': counts, 'counts_by_labels': counts_by_labels}
Example #21
    def __init__(self, device):
        super(StanfordTwitterEmbedding, self).__init__(device=device)
        self.path = Path(
            '../data/models/glove.twitter.27B/glove.twitter.27B.200d.txt')
        with_raw_file = False
        if with_raw_file:
            with self.path.open('r', encoding='utf-8-sig') as f:
                texts = f.readlines()
            headers = [len(texts), None]
            vocab, weights = map(
                list,
                zip(*Parallel(n_jobs=10)
                    ([delayed(self.get_weights)(text) for text in texts])))
            with (self.path.parent / 'vocab.pkl').open('wb') as f:
                pickle.dump(vocab, f)
            with (self.path.parent / 'weights.pkl').open('wb') as f:
                pickle.dump(weights, f)
        else:
            with (self.path.parent / 'vocab.pkl').open('rb') as f:
                vocab = pickle.load(f)
            with (self.path.parent / 'weights.pkl').open('rb') as f:
                weights = pickle.load(f)

        self.indexer = Indexer(special_tokens={
            '<s>': 0,
            '<unk>': 1,
            '<pad>': 2,
            '<\s>': 3,
            '<mask>': 4
        },
                               with_del_stopwords=self.with_del_stopwords)
        for word in vocab:
            self.indexer.count_word(word)
            self.indexer.add_word(word)
        self.embedding_dim = len(weights[0])
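        # zero vectors reserved for the five special tokens prepended to the pretrained weights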
        special_weights = [[0.0] * self.embedding_dim] * 5
        weights = torch.FloatTensor(special_weights + weights)
        self.embedding = nn.Embedding.from_pretrained(
            embeddings=weights, padding_idx=self.indexer.padding_index)
        self.embedding.to(device)
Example #22
def main(inputDir, outputDir):

    m = Indexer()
    files = os.listdir(inputDir)

    #for measuring elapsed time
    elapsed_time = []
    start = time.time()

    #Loop over all files in the given directory
    for file in files:
        if file.endswith(".html"):
            inputFile = os.path.join(inputDir, file)

            m.parse(inputFile)  #the tokenization happens inside this function
            m.mappings.append((m.doc_num, inputFile))

            end = time.time()
            elapsed_time.append(end - start)

    m.writeFiles(outputDir, N=len(m.mappings))
    print("Ran in {} seconds.".format(elapsed_time[-1]))
Example #23
def main():
    indexer = Indexer()
    numDocs = 0

    for subdir, dirs, files in os.walk(
            r'C:\Users\Kevin Huynh\Projects\cs121-a3\DEV'
    ):  ## TODO: need to update to directory's DEV folder
        for filename in files:
            filepath = subdir + os.sep + filename
            with open(filepath) as f:
                data = json.load(f)
            print(data["url"])
            indexer.parse(data['content'], data['url'])
            numDocs += 1

    indexer.compute_tdidf()
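    # rank tokens by total frequency, then write each token's postings to a per-prefix file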
    sortedTokens = sorted(indexer.invertedIndex.items(),
                          key=lambda x: x[1]["total_frequency"],
                          reverse=True)
    print("Number of Documents: {}".format(numDocs))
    print("Number of Unique Tokens: {}".format(
        len(indexer.invertedIndex.keys())))
    # file1 = open("index.txt", "a")
    for k, v in sortedTokens:
        postings = v["postings"]
        if len(k) < 2:
            filename = k + ".txt"
        else:
            filename = k[:2] + ".txt"

        file = open("indexes/" + filename, "a")

        sorted_postings = sorted(postings.items(),
                                 key=lambda x: x[1],
                                 reverse=True)

        file.write("{}:{}\n".format(k, sorted_postings))
        file.close()
Example #24
def main():
    indexer = Indexer()
    numDocs = 0

    for subdir, dirs, files in os.walk(
            r'C:\Users\Justin Ho\Documents\CS 121\developer\DEV'):
        for filename in files:
            filepath = subdir + os.sep + filename
            with open(filepath) as f:
                data = json.load(f)
            print(data["url"])
            indexer.parse(data['content'], data['url'])
            numDocs += 1

    sortedTokens = sorted(indexer.invertedIndex.items(),
                          key=lambda x: x[1]["tf-idf"],
                          reverse=True)
    print("Number of Documents: {}".format(numDocs))
    print("Number of Unique Tokens: {}".format(
        len(indexer.invertedIndex.keys())))
    file1 = open("index.txt", "a")
    for k, v in sortedTokens:
        file1.write("{}:{}\n".format(k, v))
    file1.close()
Example #25
from Model import *
from Indexer import Indexer

DB.connect()
"""RUN this only once when creating table!"""
#DB.create_tables([IndexerTable])

test = Indexer()

zidan = test.addToIndex("zidan", "http://www.ZidanMusk.com")
osmium = test.addToIndex("osmium", "http://www.Osmium.com")
abdo = test.addToIndex("abdo", "http://www.Abdo.com")

words = ["zidan", "osmium", "abdo", "musk"]
searchW = test.lookupWithWords(words)

pages = [
    "http://www.ZidanMusk.com", "http://www.Osmium.com",
    "http://www.Abdo.comNOOO"
]
searchP = test.lookupWithPages(pages)

print(searchW)
print(searchP)

DB.close()
Example #26
    # Add a required original file name.
    parser.add_argument('original', help='The Original text file name.')

    # Add a required preprocessed file name.
    parser.add_argument('preprocessed',
                        help='The Preprocessed file for building the index.')

    # Add an optional map argument. The 'dest' arg is how
    # it will be referred to inside the parser.
    parser.add_argument('--map',
                help='The requested multimap data structure.', default='avl')

    # Add an optional Index file name. The 'dest' arg is how
    # it will be referred to inside the parser.
    parser.add_argument('--index',
                        dest='index', help='The optional Index file name.')

    # If no input is supplied, that will be handled by argparse,
    # since it was a required argument.
    # Get the args for use.
    args = parser.parse_args()
    if args.index:
        myIndexer = Indexer(args.original,args.preprocessed,
                            args.map,args.index)
    else:
        myIndexer = Indexer(args.original,args.preprocessed,args.map)

    myIndexer.index()
    myIndexer.UserInterface()
Example #27
File: crawl.py Project: noxerit/cms
from Frontier import Frontier
from PageRanker import PageRanker
from Indexer import Indexer
from Searcher import Searcher
import re

frontier = Frontier()
pageRanker = PageRanker()
indexer = Indexer()

seedDocuments = [
    'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d01.html',
    'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d06.html',
    'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d08.html'
]


def printWebGraph(webGraph):
    print
    print '-*( Web Graph )*-'
    print
    for entry in sorted(webGraph.keys()):
        print entry + ' -> ' + ', '.join(webGraph[entry])


def printIndex(index):
    print
    print '-*( Indices )*-'
    print
    for term, occurences in sorted(index.iteritems()):
        print '(' + term[0] + ', df:' + str(term[1]) + ') ->',
Example #28
from Indexer import Indexer
import argparse
import sys

if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='Indexes words in '
        'a preprocessed document and searches for requested keywords.')
    parser.add_argument('original', help='Original text document')
    parser.add_argument('preprocessed', help='Preprocessed text document')
    parser.add_argument('--index', dest='indexed', help='Writes list of '
                        'index words into the given file on quit.')
    parser.add_argument('--map', dest='map_type', help='Uses given '
                        'data structures to index words. Available options are '
                        'avl, unsorted, sorted, chain, probe, splay, rb, dict,'
                        ' and od.')
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args()

    if args:
        indexer = Indexer(args.original, args.preprocessed, args.indexed,
                          args.map_type)
        indexer.index()
        print(indexer)
        indexer.startUI()
        indexer.dump()
    else:
        parser.print_help()
Example #29
 def run_batch(self, docs, batchNo, mapper):
     print "Running Batch %s" % batchNo
     indexer = Indexer(docs, mapper, self.catalogs, self.docLengths)
     indexer.index(keep_stopwords=True, stem=False)
Example #30
import Queue
reportQueue = Queue.Queue(200)
from HttpServer import HttpServerThread
import time
from Logger import *
from Indexer import Indexer  # assumed import path for the Indexer worker thread used below

if __name__=="__main__":
    log_info("start indexer ...")
    cfgFile="../conf/conf.yml"
    conf = RygConf.load(cfgFile)
    httpd = HttpServerThread(conf.center_host, conf.center_port, conf.mon_hosts, reportQueue)
    httpd.start()
    log_info("httpd thread started" )
    processors = list()
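    # spawn one Indexer worker thread per configured thread, all consuming from reportQueue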
    for i in xrange(0, conf.threads):
        p = Indexer(reportQueue, conf)
        p.start()
        processors.append(p)
    log_info("processor threads started")
    #endless loop until terminated, do healthy check termly
    while True:
        if not httpd.isAlive():
            log_critical("HTTPD thread exited, terminate for restart")
            break
        errorFlag = False
        for p in processors:
            if not p.isAlive():
                errorFlag = True
                break
        if errorFlag:
            log_critical("ReportProcessor thread exited, terminate for restart")