Example #1
File: server.py Project: medialab/hyphe
import os

from twisted.internet.protocol import Factory
from traph import Traph

# TraphProtocol is defined elsewhere in hyphe's server.py


class TraphServerFactory(Factory):

    default_WECR = '(s:[a-zA-Z]+\\|(t:[0-9]+\\|)?(h:[^\\|]+\\|(h:[^\\|]+\\|)+|h:(localhost|(\\d{1,3}\\.){3}\\d{1,3}|\\[[\\da-f]*:[\\da-f:]*\\])\\|))'
    WECRs = {
      's:http|h:com|h:world|': '(s:[a-zA-Z]+\\|(t:[0-9]+\\|)?(h:[^\\|]+\\|(h:[^\\|]+\\|)+|h:(localhost|(\\d{1,3}\\.){3}\\d{1,3}|\\[[\\da-f]*:[\\da-f:]*\\])\\|)(p:[^\\|]+\\|){1})'
    }

    def __init__(self, corpus, traph_dir="traph-data", default_WECR=None, WECRs=None):
        self.traph_dir = traph_dir
        self.corpus = corpus
        if not os.path.isdir(self.traph_dir):
            os.makedirs(self.traph_dir)
        self.traph = Traph(
          folder=os.path.join(self.traph_dir, corpus),
          default_webentity_creation_rule=default_WECR or self.default_WECR,
          webentity_creation_rules=WECRs or self.WECRs
        )

    def ready(self):
        # printed to stdout so the parent process knows the traph is ready
        print("READY")

    def buildProtocol(self, addr):
        return TraphProtocol(self.traph)

    def close(self):
        self.traph.close()
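
For context, a minimal sketch of how such a factory could be started with Twisted (the port and corpus name are assumptions for illustration; hyphe's actual bootstrapping differs):

from twisted.internet import reactor

factory = TraphServerFactory('my-corpus')  # hypothetical corpus name
reactor.listenTCP(8765, factory)           # hypothetical port
factory.ready()                            # announce readiness on stdout
reactor.run()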
Example #2
class TraphServerFactory(Factory):

    default_WECR = '(s:[a-zA-Z]+\\|(t:[0-9]+\\|)?(h:[^\\|]+\\|(h:[^\\|]+\\|)|h:(localhost|(\\d{1,3}\\.){3}\\d{1,3}|\\[[\\da-f]*:[\\da-f:]*\\])\\|))'
    WECRs = {
        's:http|h:com|h:world|':
        '(s:[a-zA-Z]+\\|(t:[0-9]+\\|)?(h:[^\\|]+\\|(h:[^\\|]+\\|)+|h:(localhost|(\\d{1,3}\\.){3}\\d{1,3}|\\[[\\da-f]*:[\\da-f:]*\\])\\|)(p:[^\\|]+\\|){1})'
    }

    def __init__(self,
                 corpus,
                 traph_dir="traph-data",
                 default_WECR=None,
                 WECRs=None):
        self.traph_dir = traph_dir
        self.corpus = corpus
        if not os.path.isdir(self.traph_dir):
            os.makedirs(self.traph_dir)
        self.traph = Traph(folder=os.path.join(self.traph_dir, corpus),
                           default_webentity_creation_rule=default_WECR
                           or self.default_WECR,
                           webentity_creation_rules=WECRs or self.WECRs)

    def ready(self):
        # printed to stdout so the parent process knows the traph is ready
        print("READY")

    def buildProtocol(self, addr):
        return TraphProtocol(self.traph)

    def close(self):
        self.traph.close()
Example #3
def __init__(self,
             corpus,
             traph_dir="traph-data",
             default_WECR=None,
             WECRs=None):
    self.traph_dir = traph_dir
    self.corpus = corpus
    if not os.path.isdir(self.traph_dir):
        os.makedirs(self.traph_dir)
    self.traph = Traph(folder=os.path.join(self.traph_dir, corpus),
                       default_webentity_creation_rule=default_WECR
                       or self.default_WECR,
                       webentity_creation_rules=WECRs or self.WECRs)
Example #4
def main():
    """Entry point of the program."""

    data = Data.get_csv_data()
    if len(data) < 2:
        print('Invalid CSV file')
        return
    if any(len(row) <= 13 for row in data):
        print('Not all columns are present')
        return
    for i, row in enumerate(data[1:], start=2):
        # Replace straight quotes with guillemets and strip extra whitespace
        row = [replace_quotes(string.strip()) for string in row]

        kwargs = {
            'name': row[0],  # Full name
            'short_name': row[1],  # Short name
            'what': row[2],  # Kind of cookie
            'netto': row[3],  # Net weight
            'brutto': row[4],  # Gross weight
            'proteins': row[5],  # Proteins
            'fats': row[6],  # Fats
            'carbohydrates': row[7],  # Carbohydrates
            'kkal': row[8],  # Kilocalories
            'tu': row[9],  # TU
            'category': row[10],  # Category
            'composition': row[11],  # Ingredients
            'shelf_life': row[12],  # Shelf life
            'code': row[13],  # Barcode
        }
        if len(row) > 14:
            kwargs['bold_text'] = row[14]  # Bold text
        if len(row) > 15:
            kwargs['top_image'] = row[15]  # Top image

        if len(kwargs['category']) == 0:
            print('Category in row {} is empty.'.format(i))
            continue

        traph = Traph(**kwargs)
        traph.make_traph()

        # Free the memory
        del traph

    print('Program finished!')
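
replace_quotes is not shown in this snippet; judging by the comment above it, a plausible implementation (an assumption, not the project's actual code) swaps straight double quotes for guillemets:

def replace_quotes(text):
    # Alternate « and » for each straight double quote encountered,
    # assuming quotes come in opening/closing pairs
    result, opening = [], True
    for ch in text:
        if ch == '"':
            result.append('«' if opening else '»')
            opening = not opening
        else:
            result.append(ch)
    return ''.join(result)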
Example #5
File: server.py Project: medialab/hyphe
def __init__(self, corpus, traph_dir="traph-data", default_WECR=None, WECRs=None):
    self.traph_dir = traph_dir
    self.corpus = corpus
    if not os.path.isdir(self.traph_dir):
        os.makedirs(self.traph_dir)
    self.traph = Traph(
        folder=os.path.join(self.traph_dir, corpus),
        default_webentity_creation_rule=default_WECR or self.default_WECR,
        webentity_creation_rules=WECRs or self.WECRs
    )
Example #6
    def get_traph(self, **kwargs):
        options = {
            'overwrite': False,
            'default_webentity_creation_rule': DEFAULT_WEBENTITY_CREATION_RULE,
            'webentity_creation_rules': WEBENTITY_CREATION_RULES,
            'folder': self.folder
        }

        options.update(kwargs)

        return Traph(**options)
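
Because options.update(kwargs) runs after the defaults are set, any keyword argument overrides them. A hypothetical call:

# 'holder' stands for whatever object defines get_traph (hypothetical name)
traph = holder.get_traph(overwrite=True)  # same defaults, but rebuild from scratch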
Example #7
webentity_creation_rules_regexp = {
    'domain':
    '(s:[a-zA-Z]+\\|(t:[0-9]+\\|)?(h:[^\\|]+\\|(h:[^\\|]+\\|)+|h:(localhost|(\\d{1,3}\\.){3}\\d{1,3}|\\[[\\da-f]*:[\\da-f:]*\\])\\|))',
    'path1':
    '(s:[a-zA-Z]+\\|(t:[0-9]+\\|)?(h:[^\\|]+\\|(h:[^\\|]+\\|)+|h:(localhost|(\\d{1,3}\\.){3}\\d{1,3}|\\[[\\da-f]*:[\\da-f:]*\\])\\|)(p:[^\\|]+\\|){1})',
    'path2':
    '(s:[a-zA-Z]+\\|(t:[0-9]+\\|)?(h:[^\\|]+\\|(h:[^\\|]+\\|)+|h:(localhost|(\\d{1,3}\\.){3}\\d{1,3}|\\[[\\da-f]*:[\\da-f:]*\\])\\|)(p:[^\\|]+\\|){2})'
}

default_webentity_creation_rule = webentity_creation_rules_regexp['domain']

webentity_creation_rules = {
    's:http|h:com|h:twitter|': webentity_creation_rules_regexp['path1'],
}
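
These rules operate on hyphe's LRU notation, in which a URL is rewritten as pipe-delimited stems from the most generic part down: s: scheme, t: port, h: host labels in reversed order, p: path segments. An illustrative conversion (hypothetical URL):

# 'https://www.twitter.com/some_user' expressed as an LRU:
lru = 's:https|h:com|h:twitter|h:www|p:some_user|'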

# Creating the Traph
traph = Traph(folder='./scripts/data/',
              default_webentity_creation_rule=default_webentity_creation_rule,
              webentity_creation_rules=webentity_creation_rules)

webentities_network = traph.get_webentities_links()

# g = nx.Graph()

# for source, targets in webentities_network.items():
#     g.add_node(source, label=source)

#     for target in targets:
#         g.add_node(target, label=target)
#         g.add_edge(source, target)

# nx.write_gexf(g, './scripts/data/dump.gexf')
Example #8
webentity_creation_rules_regexp = {
    # 'domain' and 'path1' entries elided by the snippet; see Example #7
    'path2':
    '(s:[a-zA-Z]+\\|(t:[0-9]+\\|)?(h:[^\\|]+\\|(h:[^\\|]+\\|)+|h:(localhost|(\\d{1,3}\\.){3}\\d{1,3}|\\[[\\da-f]*:[\\da-f:]*\\])\\|)(p:[^\\|]+\\|){2})'
}

default_webentity_creation_rule = webentity_creation_rules_regexp['domain']

webentity_creation_rules = {
    's:http|h:com|h:twitter|': webentity_creation_rules_regexp['path1'],
    's:http|h:com|h:facebook|': webentity_creation_rules_regexp['path1'],
    's:http|h:com|h:linkedin|': webentity_creation_rules_regexp['path2']
}

webentity_store = WebEntityStore('./scripts/data/webentities.json')

traph = Traph(overwrite=True,
              folder='./scripts/data/',
              default_webentity_creation_rule=default_webentity_creation_rule,
              webentity_creation_rules=webentity_creation_rules)
trie = traph.lru_trie
links = traph.link_store

print(trie.header)
print(links.header)

for page in PAGES:
    traph.add_page(page)

traph.add_links(LINKS)

for source_lru, target_lru in traph.links_iter():
    print('Source: %s, Target: %s' % (source_lru, target_lru))
Example #9
webentity_creation_rules_regexp = {
    # 'domain', 'path1' and 'path2' entries elided by the snippet; see Example #7
}

default_webentity_creation_rule = webentity_creation_rules_regexp['domain']

webentity_creation_rules = {}

# Webentity store is necessary to keep track of web entities' prefixes.
# Though the traph could retrieve them, it would not be efficient.
# In a real situation, these would be tracked elsewhere.
# That's what we are simulating with this store.
webentity_store = WebEntityStore('./scripts/data/webentities.json')
webentity_store.data['webentities'] = {}

# Instantiate the traph
traph = Traph(overwrite=True,
              folder='./scripts/data/',
              default_webentity_creation_rule=default_webentity_creation_rule,
              webentity_creation_rules=webentity_creation_rules)

# Generate random pages
pages_count = 100
print('\n:: Generate %s lorem-ipsum-based pages' % pages_count)
voc = ['lorem', 'ipsum', 'dolor', 'sit', 'amet', 'hodor', 'consectetur']
path_sizes = [1, 2, 3]
for i in range(pages_count):
    path_size = random.choice(path_sizes)
    protocol = 's:http|'
    tld = 'h:com|'
    host = 'h:%s|' % (random.choice(voc))
    path = ''
    for p in range(path_size):
        path += 'p:%s|' % (random.choice(voc))
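
The snippet is truncated here. A plausible continuation of the loop body (an assumption, not shown in the original) would assemble the LRU and index the page:

    # still inside the for-loop above
    lru = protocol + tld + host + path
    traph.add_page(lru)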
Example #10
webentity_creation_rules_regexp = {
    # 'domain', 'path1' and 'path2' entries elided by the snippet; see Example #7
}

default_webentity_creation_rule = webentity_creation_rules_regexp['domain']

webentity_creation_rules = {}

# Webentity store is necessary to keep track of web entities' prefixes.
# Though the traph could retrieve them, it would not be efficient.
# In a real situation, these would be tracked elsewhere.
# That's what we are simulating with this store.
webentity_store = WebEntityStore('./scripts/data/webentities.json')
webentity_store.data['webentities'] = {}

# Instantiate the traph
traph = Traph(overwrite=True,
              folder='./scripts/data/',
              default_webentity_creation_rule=default_webentity_creation_rule,
              webentity_creation_rules=webentity_creation_rules)

print('\n:: Setup')

print '- Create a "Twitter" webentity with the 4 prefix variations (WWW and HTTPS cases)'
twitter_prefixes = [
    's:http|h:com|h:twitter|', 's:http|h:com|h:twitter|h:www|',
    's:https|h:com|h:twitter|', 's:https|h:com|h:twitter|h:www|'
]
report = traph.create_webentity(twitter_prefixes)
webentity_store.data['webentities'].update(report.created_webentities)
twitter_weid = list(report.created_webentities.keys())[0]  # Used below

print '- Create a "Ego" webentity with ego.com (4 prefixes) as well as a Twitter account (additional 4 prefixes)'
ego_prefixes = [
Example #11
from traph import Traph

traph = Traph(folder='./', debug=True)
trie = traph.lru_trie
link_store = traph.link_store

euronews_id = 342
euronews_prefixes = [
    's:https|h:com|h:euronews|h:fr|', 's:http|h:com|h:euronews|h:fr|',
    's:http|h:com|h:euronews|h:fr|h:www|',
    's:https|h:com|h:euronews|h:fr|h:www|'
]

linked_ids = set([96, 98, 299, 315])


def links_iter(weid, prefixes):
    for prefix in prefixes:
        starting_node = trie.lru_node(prefix)
        target_node = trie.node()

        for node, lru in trie.webentity_dfs_iter(starting_node, prefix):

            if not node.is_page():
                continue

            if node.has_outlinks():
                links_block = node.outlinks()

                for link_node in link_store.link_nodes_iter(links_block):
Example #12
webentity_creation_rules_regexp = {
    # 'domain', 'path1' and 'path2' entries elided by the snippet; see Example #7
}

default_webentity_creation_rule = webentity_creation_rules_regexp['domain']

webentity_creation_rules = {}

# Webentity store is necessary to keep track of web entities' prefixes.
# Though the traph could retrieve them, it would not be efficient.
# In a real situation, these would be tracked elsewhere.
# That's what we are simulating with this store.
webentity_store = WebEntityStore('./scripts/data/webentities.json')
webentity_store.data['webentities'] = {}

# Instantiate the traph
traph = Traph(overwrite=True,
              folder='./scripts/data/',
              default_webentity_creation_rule=default_webentity_creation_rule,
              webentity_creation_rules=webentity_creation_rules)

# Step 1
print('\n:: Step 1 - Create a "Boeing" webentity with the 4 prefix variations (WWW and HTTPS cases).')
print('Expected: Creates the entity with the 4 prefixes. This is the typical use case.')

boeing_prefixes = [
    's:http|h:com|h:boeing|', 's:http|h:com|h:boeing|h:www|',
    's:https|h:com|h:boeing|', 's:https|h:com|h:boeing|h:www|'
]
report = traph.create_webentity(boeing_prefixes)
webentity_store.data['webentities'].update(report.created_webentities)
boeing_weid = list(report.created_webentities.keys())[0]  # Used for a step below

print('\nResult - Existing webentities from Store:')
Example #13
# Instantiate traph with a custom rule: split after 'world' (continents)
print('\n"Continents" rule given at traph init (continents should be entities)')
webentity_creation_rules = {
    's:http|h:com|h:world|': webentity_creation_rules_regexp['path1'],
}

# Webentity store is necessary to keep track of web entities' prefixes.
# Though the traph could retrieve them, it would not be efficient.
# In a real situation, these would be tracked elsewhere.
# That's what we are simulating with this store.
webentity_store = WebEntityStore('./scripts/data/webentities.json')
webentity_store.data['webentities'] = {}

# Instantiate the traph
traph = Traph(overwrite=True,
              folder='./scripts/data/',
              default_webentity_creation_rule=default_webentity_creation_rule,
              webentity_creation_rules=webentity_creation_rules)

# Step 1
print('\n:: Step 1: Add the "Madrid" page')
print('Expected: "Europe" webentity created (matching the rule given at init), "World" not created')

report = traph.add_page('s:http|h:com|h:world|p:europe|p:spain|p:madrid|')
webentity_store.data['webentities'].update(report.created_webentities)

print('\nResult - Existing webentities:')
for weid, prefixes in webentity_store.data['webentities'].items():
    print(' - Webentity %s\t%s + %s other prefixes' % (weid, prefixes[0],
                                                       len(prefixes) - 1))

# Step 2
Example #14
TRAPH_FOLDER = './sample-traph'
OUTPUT = './youtube-inlinks.csv'

YOUTUBE_LRUS = [
    's:http|h:com|h:youtube|', 's:https|h:com|h:youtube|',
    's:http|h:com|h:youtube|h:www|', 's:https|h:com|h:youtube|h:www|',
    's:http|h:com|h:googleapis|h:youtube|',
    's:https|h:com|h:googleapis|h:youtube|',
    's:http|h:com|h:googleapis|h:youtube|h:www|',
    's:https|h:com|h:googleapis|h:youtube|h:www|', 's:http|h:be|h:youtu|',
    's:https|h:be|h:youtu|', 's:http|h:be|h:youtu|h:www|',
    's:https|h:be|h:youtu|h:www|'
]

traph = Traph(folder=TRAPH_FOLDER, debug=True)


def windup_lru(block):
    node = traph.lru_trie.node(block=block)

    lru = node.stem()
    webentity = node.webentity() if node.has_webentity() else None

    for parent in traph.lru_trie.node_parents_iter(node):
        lru = parent.stem() + lru

        if webentity is None and parent.has_webentity():
            webentity = parent.webentity()

    return lru, webentity
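
windup_lru rebuilds a page's full LRU by walking from the node back up to the trie root, prepending each parent's stem, and returns the closest webentity flagged along that path. A hypothetical call:

# 'block' would be a node block id taken from the link store (hypothetical here)
lru, webentity = windup_lru(block)
print(lru, webentity)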
Example #15
print('Custom creation rules: "Hodor" is a path-2 platform and "Lorem Ipsum" is a path-1 platform')
webentity_creation_rules = {
    's:http|h:com|h:hodor|': webentity_creation_rules_regexp['path2'],
    's:http|h:com|h:lorem|h:ipsum|': webentity_creation_rules_regexp['path1'],
}

# Webentity store is necessary to keep track of web entities' prefixes.
# Though the traph could retrieve them, it would not be efficient.
# In a real situation, these would be tracked elsewhere.
# That's what we are simulating with this store.
webentity_store = WebEntityStore('./scripts/data/webentities.json')
webentity_store.data['webentities'] = {}

# Instantiate the traph
traph = Traph(overwrite=True,
              folder='./scripts/data/',
              default_webentity_creation_rule=default_webentity_creation_rule,
              webentity_creation_rules=webentity_creation_rules)


# LRU generation process
def random_lru(voc, domain_sizes, path_sizes):
    host_size = random.choice(domain_sizes)
    path_size = random.choice(path_sizes)
    protocol = 's:http|'
    tld = 'h:com|'
    host = ''
    for h in range(host_size):
        host += 'h:%s|' % (random.choice(voc))
    path = ''
    for p in range(path_size):
        path += 'p:%s|' % (random.choice(voc))
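
random_lru is cut off here; presumably it ends by assembling and returning the generated LRU (an assumption, not shown in the original):

    return protocol + tld + host + path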
Example #16
webentity_creation_rules_regexp = {
    'domain':
    '(s:[a-zA-Z]+\\|(t:[0-9]+\\|)?(h:[^\\|]+\\|(h:[^\\|]+\\|)+|h:(localhost|(\\d{1,3}\\.){3}\\d{1,3}|\\[[\\da-f]*:[\\da-f:]*\\])\\|))',
    'path1':
    '(s:[a-zA-Z]+\\|(t:[0-9]+\\|)?(h:[^\\|]+\\|(h:[^\\|]+\\|)+|h:(localhost|(\\d{1,3}\\.){3}\\d{1,3}|\\[[\\da-f]*:[\\da-f:]*\\])\\|)(p:[^\\|]+\\|){1})',
    'path2':
    '(s:[a-zA-Z]+\\|(t:[0-9]+\\|)?(h:[^\\|]+\\|(h:[^\\|]+\\|)+|h:(localhost|(\\d{1,3}\\.){3}\\d{1,3}|\\[[\\da-f]*:[\\da-f:]*\\])\\|)(p:[^\\|]+\\|){2})'
}

default_webentity_creation_rule = webentity_creation_rules_regexp['domain']

webentity_creation_rules = {
    's:http|h:com|h:twitter|': webentity_creation_rules_regexp['path1'],
}

# Creating the Traph
traph = Traph(overwrite=True,
              folder='./scripts/data/',
              default_webentity_creation_rule=default_webentity_creation_rule,
              webentity_creation_rules=webentity_creation_rules)

# Reading from mongo
client = MongoClient(MONGO['host'], MONGO['port'])
collection = client[MONGO['db']][MONGO['collection']]


def links_generator(data):
    source = data['lru']

    for target in data['lrulinks']:
        yield source, target


links_multimap = defaultdict(list)
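
A plausible continuation (an assumption, since the original script is truncated here): group each crawled page's outlinks by source, then hand the whole batch to the traph via the batch indexing entry point hinted at in Example #17:

for doc in collection.find({}, {'lru': 1, 'lrulinks': 1}):
    for source, target in links_generator(doc):
        links_multimap[source].append(target)

traph.index_batch_crawl(links_multimap)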
Example #17
webentity_creation_rules_regexp = {
    # 'domain', 'path1' and 'path2' entries elided by the snippet; see Example #7
}

default_webentity_creation_rule = webentity_creation_rules_regexp['domain']

webentity_creation_rules = {}

# Webentity store is necessary to keep track of web entities' prefixes.
# Though the traph could retrieve them, it would not be efficient.
# In a real situation, these would be tracked elsewhere.
# That's what we are simulating with this store.
webentity_store = WebEntityStore('./scripts/data/webentities.json')
webentity_store.data['webentities'] = {}

# Instantiate the traph
traph = Traph(overwrite=True,
              folder='./scripts/data/',
              default_webentity_creation_rule=default_webentity_creation_rule,
              webentity_creation_rules=webentity_creation_rules)

print('\n:: Store network...')

use_index_batch_crawl = True

if use_index_batch_crawl:
    data = {}
    for source_lru, target_lru in LINKS:
        if source_lru in data:
            links = data[source_lru]
        else:
            links = []
        links.append(target_lru)
        data[source_lru] = links
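
The accumulation loop above is the manual equivalent of the defaultdict pattern used in Example #16:

from collections import defaultdict

data = defaultdict(list)
for source_lru, target_lru in LINKS:
    data[source_lru].append(target_lru)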
Example #18
webentity_creation_rules_regexp = {
    # 'domain', 'path1' and 'path2' entries elided by the snippet; see Example #7
}

default_webentity_creation_rule = webentity_creation_rules_regexp['domain']

webentity_creation_rules = {}

# Webentity store is necessary to keep track of web entities' prefixes.
# Though the traph could retrieve them, it would not be efficient.
# In a real situation, these would be tracked elsewhere.
# That's what we are simulating with this store.
webentity_store = WebEntityStore('./scripts/data/webentities.json')
webentity_store.data['webentities'] = {}

# Instantiate the traph
traph = Traph(overwrite=True,
              folder='./scripts/data/',
              default_webentity_creation_rule=default_webentity_creation_rule,
              webentity_creation_rules=webentity_creation_rules)

print('\n:: Simulate a crawl:')
print(' - Create webentity for "s:http|h:com|h:professor|p:augustine|p:sycamore|"')
professor_prefixes = [
    's:http|h:com|h:professor|p:augustine|p:sycamore|',
    's:http|h:com|h:professor|h:www|p:augustine|p:sycamore|',
    's:https|h:com|h:professor|p:augustine|p:sycamore|',
    's:https|h:com|h:professor|h:www|p:augustine|p:sycamore|'
]
report = traph.create_webentity(professor_prefixes)
webentity_store.data['webentities'].update(report.created_webentities)

print(' - Simulate page crawls with links to the list of target pages')
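
The snippet is truncated here. A plausible simulation step (an assumption), reusing the add_page/add_links calls shown in Example #8:

crawled = 's:http|h:com|h:professor|p:augustine|p:sycamore|'
report = traph.add_page(crawled)
webentity_store.data['webentities'].update(report.created_webentities)

# TARGET_PAGES is a hypothetical list of target LRUs
traph.add_links([(crawled, target) for target in TARGET_PAGES])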