Example #1
def endElement(self, tag):
    # SAX callback: fires each time a closing tag is reached.
    if tag == "id" and self.idFlag == 1:
        self.idFlag = 0
    elif tag == "title":
        # Remember the title for this page, then index it.
        config.id_title_map[config.page_count] = self.bufferTitle
        print("pg", config.page_count)
        process_title(self.bufferTitle)
    elif tag == "text":
        # The article body is complete; process it and build the index.
        process_text(self.bufferText)
        create_index()
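
endElement only fires as a callback inside an xml.sax handler, so the snippet assumes a surrounding class. A minimal sketch of that context, with the buffers named as in the snippet (the WikiHandler name and the startElement/characters bodies are illustrative, not from the original):

import xml.sax

class WikiHandler(xml.sax.ContentHandler):
    def __init__(self):
        super().__init__()
        self.currentTag = ""
        self.idFlag = 0
        self.bufferTitle = ""
        self.bufferText = ""

    def startElement(self, tag, attributes):
        self.currentTag = tag
        if tag == "id":
            self.idFlag = 1
        elif tag == "title":
            self.bufferTitle = ""
        elif tag == "text":
            self.bufferText = ""

    def characters(self, content):
        # Accumulate character data between the open and close tags.
        if self.currentTag == "title":
            self.bufferTitle += content
        elif self.currentTag == "text":
            self.bufferText += content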
Example #2
def crawl_and_index(base_urls, con, depth=2):
    indexed_urls = set()
    frontier = list(base_urls)

    # Breadth-first crawl: each pass follows links found on the previous
    # pass, so `depth` controls how many hops from the seed URLs we index.
    for _ in range(depth):
        next_frontier = []
        for base_url in frontier:
            indexed_urls.add(base_url)
            for link in get_valid_links(base_url):
                if link not in indexed_urls:
                    create_index(link, con)
                    indexed_urls.add(link)
                    next_frontier.append(link)
        frontier = next_frontier

    return indexed_urls
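
The example relies on a get_valid_links helper that is not shown. A minimal sketch of one possible implementation, assuming it should return absolute http(s) links scraped from the page (requests and BeautifulSoup are my choice of tooling here, not necessarily the original's):

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

def get_valid_links(base_url):
    # Fetch the page and collect absolute http/https links from <a href> tags.
    try:
        resp = requests.get(base_url, timeout=10)
        resp.raise_for_status()
    except requests.RequestException:
        return []
    soup = BeautifulSoup(resp.text, "html.parser")
    links = []
    for a in soup.find_all("a", href=True):
        url = urljoin(base_url, a["href"])
        if urlparse(url).scheme in ("http", "https"):
            links.append(url)
    return links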
Example #3
import os
import pickle
from os.path import isfile, join


def main():
    listFolder = os.listdir("DEV")  # sub-folder names in DEV/
    urlNum = dict()
    i = 0
    for folder in listFolder:
        files = [
            f for f in os.listdir(join("DEV", folder))
            if isfile(join("DEV", folder, f))
        ]  # every file in this sub-folder
        index = create_index(files, folder)  # one index per folder in DEV
        filename = "indexer" + str(i) + ".txt"
        with open(filename, 'wb') as fileObject:  # partial index per sub-folder
            pickle.dump(index.getDict(), fileObject)  # dump the index dictionary into a file
        urlNum[folder] = 0  # number of urls from each folder (just testing)
        i += 1
    with open('docID_to_url.txt', 'wb') as f:
        # The snippet dumped an undefined 'urlDict'; urlNum is the only mapping built above.
        pickle.dump(urlNum, f)
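
Reading the partial indexes back requires the reverse of the pickle.dump calls above. A minimal sketch, assuming each pickled object is a plain dict mapping terms to postings lists (the merge_partial_indexes name is hypothetical):

import pickle

def merge_partial_indexes(paths):
    # Load each pickled partial index and merge postings term by term.
    merged = {}
    for path in paths:
        with open(path, 'rb') as f:
            partial = pickle.load(f)
        for term, postings in partial.items():
            merged.setdefault(term, []).extend(postings)
    return merged

full_index = merge_partial_indexes(["indexer0.txt", "indexer1.txt"])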
Example #4
import json
import math

import javalang
from tqdm import tqdm

import find_sources
import indexer
import parse
import repository

with open('config.json') as f:
    index_config = json.load(f)

if not indexer.elastic_search_is_running(index_config):
    print('Elasticsearch is not responding on {}:{}'.format(
        index_config['host'], index_config['port']))
else:
    # Drop any stale index, then recreate it from scratch.
    indexer.delete_index(index_config)

    if not indexer.index_exists(index_config):
        print('Index "{}" was not found'.format(index_config['index']))
        print('Creating index "{}"'.format(index_config['index']))
        indexer.create_index(index_config)


repos = find_sources.load_cache()[0:500]
for data in tqdm(repos):
    repo = data["url"]
    score = data["score"]

    target_dir = repository.clone_repo(repo)
    for file in repository.java_files(target_dir):
        # print("Indexing file " + file)

        try:
            with open(file) as src:
                methods = parse.find_methods(src.read())
        except javalang.parser.JavaSyntaxError:
            # print("Skipping {}, couldn't parse it".format(file))
            continue
Example #5
import json
import logging

from indexer import connect_elasticsearch, create_index, store_record, search

property_index = 'wd_property'
entity_index = 'wd_entity'
property_doc_type = 'members'

with open('data/dump/property.json') as f:
    prop_datas = json.load(f)

logging.basicConfig(level=logging.ERROR)
es = connect_elasticsearch()
if create_index(es, property_index) and create_index(es, entity_index):
    count = 0
    for elem in prop_datas:
        prop_data = json.dumps(elem)

        # Stop at the first record that fails to store.
        if not store_record(es, property_index, prop_data):
            break
        count += 1
        if count % 100 == 0:
            print("records stored so far: {}".format(count))

    print("finished")
    print("total records stored: {}".format(count))